Skip to content

Commit f249263

Browse files
siddharthraja and claude committed
feat: Voice catalog with priority-based selection and MP3 audio fix
## Summary

- Centralized voice catalog system for managing TTS voices across providers
- Priority-based voice selection with support/customer role separation
- Fixed MP3 audio concatenation using pydub (was truncating after first turn)
- Multi-language voice selection (Hindi, Hinglish, English)

## Key Changes

### Voice Catalog System

- New `VoiceCatalog` class with 3-tier fallback matching (exact → flexible → default)
- Added priority field to VoiceEntry for tie-breaking
- Support voices (Ishan, Devansh) never used for customers
- Customer voices (Ayush, Aarav, Aarti) never used for support
- Language-aware selection: Devansh (Hindi-only), Ishan (Hinglish/English)

### Audio Fix

- Replaced naive byte concatenation with pydub AudioSegment
- Properly combines MP3 files with headers/frames intact
- Verified with 44-second test conversation (5 turns)

### Provider Updates

- Removed hardcoded voice mappings from Cartesia/ElevenLabs providers
- OpenAI TTS now strips unsupported language parameter
- PersonaService uses VoiceCatalog for all voice selection
- Added languages field to CustomerPersona model

### Testing

- End-to-end Cartesia TTS test passes
- Voice selection priority logic validated
- All 7 voice selection scenarios tested

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
1 parent 9baba4e commit f249263

File tree

10 files changed

+586
-65
lines changed

10 files changed

+586
-65
lines changed

src/vcg_cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ async def _generate_conversation(
6262

6363
# Initialize services
6464
print("📦 Loading services...")
65-
persona_service = PersonaService()
65+
tts_provider = config.providers.tts.get('type', 'openai')
66+
persona_service = PersonaService(tts_provider=tts_provider)
6667
persona_service.load_default_personas()
6768

6869
# Get personas

src/voice_conversation_generator/models/persona.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ class CustomerPersona(Persona):
125125
goal: str = ""
126126
special_behavior: str = ""
127127
difficulty: str = "medium" # easy, medium, hard
128+
languages: List[str] = field(default_factory=lambda: ['en']) # Languages spoken by this persona
128129

129130
def __post_init__(self):
130131
"""Ensure type is set to CUSTOMER"""
@@ -138,7 +139,8 @@ def to_dict(self) -> Dict[str, Any]:
138139
"issue": self.issue,
139140
"goal": self.goal,
140141
"special_behavior": self.special_behavior,
141-
"difficulty": self.difficulty
142+
"difficulty": self.difficulty,
143+
"languages": self.languages
142144
})
143145
return base_dict
144146

@@ -166,7 +168,8 @@ def from_dict(cls, data: Dict[str, Any]) -> 'CustomerPersona':
166168
issue=data.get("issue", ""),
167169
goal=data.get("goal", ""),
168170
special_behavior=data.get("special_behavior", ""),
169-
difficulty=data.get("difficulty", "medium")
171+
difficulty=data.get("difficulty", "medium"),
172+
languages=data.get("languages", ["en"])
170173
)
171174

172175

src/voice_conversation_generator/providers/llm/openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def __init__(self, config: Dict[str, Any]):
3232
)
3333

3434
# Set default model
35-
self.model = config.get('model', 'gpt-4')
35+
self.model = config.get('model', 'gpt-4.1')
3636

3737
async def generate_completion(
3838
self,

src/voice_conversation_generator/providers/tts/cartesia.py

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -106,39 +106,11 @@ async def generate_speech(
106106
if language not in self.SUPPORTED_LANGUAGES:
107107
language = self.default_language
108108

109-
# Map common voice names to Cartesia voice IDs
110-
# For Hindi/Hinglish (hi language), use Indian voices
111-
# For English/other languages, use default English voices
112-
is_hindi = language in ['hi', 'hindi', 'hinglish']
113-
114-
voice_name_mapping_hi = {
115-
# Hindi/Hinglish mappings (use Indian voices)
116-
'onyx': 'fd2ada67-c2d9-4afe-b474-6386b87d8fc3', # Ishan for support
117-
'alloy': '1259b7e3-cb8a-43df-9446-30971a46b8b0', # Devansh for customer
118-
'echo': '1259b7e3-cb8a-43df-9446-30971a46b8b0', # Devansh for male customer
119-
'fable': '1259b7e3-cb8a-43df-9446-30971a46b8b0', # Devansh for elderly male
120-
'nova': '6ccbfb76-1fc6-48f7-b71d-91ac6298247b', # Fallback to English female (TODO: add Hindi female)
121-
'shimmer': '6ccbfb76-1fc6-48f7-b71d-91ac6298247b', # Fallback to English female (TODO: add Hindi female)
122-
}
123-
124-
voice_name_mapping_en = {
125-
# English mappings
126-
'onyx': 'a0e99841-438c-4a64-b679-ae501e7d6091', # Professional male
127-
'alloy': 'a0e99841-438c-4a64-b679-ae501e7d6091', # Professional male
128-
'echo': 'a167e0f3-df7e-4d52-a9c3-f949145efdab', # Customer support man
129-
'fable': 'a0e99841-438c-4a64-b679-ae501e7d6091', # Professional male
130-
'nova': 'f9836c6e-a0bd-460e-9d3c-f7299fa60f94', # Professional female
131-
'shimmer': '6ccbfb76-1fc6-48f7-b71d-91ac6298247b', # Natural female
132-
}
133-
134-
# Choose mapping based on language
135-
voice_name_mapping = voice_name_mapping_hi if is_hindi else voice_name_mapping_en
136-
137-
# If voice_id is a name from our defaults or OpenAI mapping, resolve it
109+
# Voice selection is handled by PersonaService via VoiceCatalog
110+
# The voice_id should already be a valid Cartesia voice ID
111+
# For backwards compatibility, check if it's a DEFAULT_VOICES key
138112
if voice_id in self.DEFAULT_VOICES:
139113
voice_id = self.DEFAULT_VOICES[voice_id]
140-
elif voice_id in voice_name_mapping:
141-
voice_id = voice_name_mapping[voice_id]
142114

143115
try:
144116
# Generate audio using bytes streaming method

src/voice_conversation_generator/providers/tts/elevenlabs.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -65,14 +65,10 @@ async def generate_speech(
6565
if not self.available:
6666
raise RuntimeError("ElevenLabs client not available")
6767

68-
# Determine voice ID
68+
# Voice selection is handled by PersonaService via VoiceCatalog
69+
# The voice_id should already be a valid ElevenLabs voice ID
6970
voice_id = voice_config.voice_id or self.default_voice_id
7071

71-
# Check for speaker type hint in kwargs
72-
speaker_type = kwargs.get('speaker_type', 'support')
73-
if not voice_config.voice_id and speaker_type in self.DEFAULT_VOICES:
74-
voice_id = self.DEFAULT_VOICES[speaker_type]
75-
7672
# Build voice settings
7773
voice_settings = {
7874
"stability": voice_config.stability,

src/voice_conversation_generator/providers/tts/openai.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,16 @@ async def generate_speech(
5353
Audio data as bytes (MP3 format)
5454
"""
5555
# Use voice config or defaults
56+
# Voice selection is handled by PersonaService via VoiceCatalog
57+
# For OpenAI, voice_id and voice_name are the same (e.g., 'onyx', 'echo')
5658
model = voice_config.model or self.default_model
57-
voice = voice_config.voice_name or self.default_voice
59+
voice = voice_config.voice_id or voice_config.voice_name or self.default_voice
5860
speed = voice_config.speed
5961

62+
# OpenAI TTS doesn't support language parameter - it auto-detects
63+
# Remove it from kwargs if present
64+
kwargs.pop('language', None)
65+
6066
# Validate voice
6167
if voice not in self.SUPPORTED_VOICES:
6268
voice = self.default_voice

src/voice_conversation_generator/services/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@
44
from .orchestrator import ConversationOrchestrator
55
from .persona_service import PersonaService
66
from .provider_factory import ProviderFactory
7+
from .voice_catalog import VoiceCatalog, VoiceEntry, get_voice_catalog
78

89
__all__ = [
910
"ConversationOrchestrator",
1011
"PersonaService",
11-
"ProviderFactory"
12+
"ProviderFactory",
13+
"VoiceCatalog",
14+
"VoiceEntry",
15+
"get_voice_catalog"
1216
]

src/voice_conversation_generator/services/orchestrator.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -367,15 +367,39 @@ async def save_conversation(
367367
# Combine audio if requested
368368
combined_audio = None
369369
if combine_audio and any(t.audio_data for t in conversation.turns):
370-
# Combine all audio segments
370+
# Combine all audio segments using pydub
371371
audio_segments = []
372372
for turn in conversation.turns:
373373
if turn.audio_data:
374374
audio_segments.append(turn.audio_data)
375375

376376
if audio_segments:
377-
# Simple concatenation - in production, might add silence gaps
378-
combined_audio = b"".join(audio_segments)
377+
# Use pydub to properly combine MP3 files
378+
try:
379+
from pydub import AudioSegment
380+
import io
381+
382+
# Load each MP3 segment
383+
combined = None
384+
for audio_bytes in audio_segments:
385+
segment = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
386+
if combined is None:
387+
combined = segment
388+
else:
389+
combined += segment # Properly concatenate audio
390+
391+
# Export combined audio as MP3
392+
if combined:
393+
output_buffer = io.BytesIO()
394+
combined.export(output_buffer, format='mp3', bitrate='128k')
395+
combined_audio = output_buffer.getvalue()
396+
397+
except ImportError:
398+
print("Warning: pydub not available. Audio combination may not work correctly.")
399+
combined_audio = b"".join(audio_segments) # Fallback
400+
except Exception as e:
401+
print(f"Warning: Audio combination failed: {e}. Using fallback.")
402+
combined_audio = b"".join(audio_segments) # Fallback
379403

380404
# Save to storage
381405
storage_paths = await self.storage.save_conversation(

src/voice_conversation_generator/services/persona_service.py

Lines changed: 57 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,24 @@
1111
EmotionalState,
1212
VoiceConfig
1313
)
14+
from .voice_catalog import get_voice_catalog
1415

1516

1617
class PersonaService:
1718
"""Service for managing personas"""
1819

19-
def __init__(self, personas_dir: str = "personas"):
20+
def __init__(self, personas_dir: str = "personas", tts_provider: str = "openai"):
2021
"""Initialize persona service
2122
2223
Args:
2324
personas_dir: Directory containing persona definitions
25+
tts_provider: TTS provider to use for voice selection (default: 'openai')
2426
"""
2527
self.personas_dir = Path(personas_dir)
2628
self.customer_personas: Dict[str, CustomerPersona] = {}
2729
self.support_personas: Dict[str, SupportPersona] = {}
30+
self.tts_provider = tts_provider
31+
self.voice_catalog = get_voice_catalog()
2832

2933
def load_default_personas(self):
3034
"""Load default personas from the existing scenarios"""
@@ -54,6 +58,21 @@ def _load_support_persona_from_file(self):
5458
prompt_text = """You are a helpful customer support agent.
5559
Be professional, empathetic, and solution-focused."""
5660

61+
# Query voice catalog for support agent (Hinglish support for Indian context)
62+
support_voice = self.voice_catalog.get_voice(
63+
provider=self.tts_provider,
64+
languages=["hi", "en"], # Hinglish support
65+
accent="india",
66+
gender="male",
67+
persona_type="support_agent"
68+
)
69+
70+
# Create voice config from catalog
71+
voice_config = VoiceConfig(provider=self.tts_provider, speed=1.0)
72+
if support_voice:
73+
voice_config.voice_id = support_voice.voice_id
74+
voice_config.voice_name = support_voice.name
75+
5776
# Create default support persona
5877
support_persona = SupportPersona(
5978
id="default_support",
@@ -72,11 +91,7 @@ def _load_support_persona_from_file(self):
7291
"Offer payment plans for amounts over ₹5000",
7392
"Escalate to supervisor for refund requests over ₹10000"
7493
],
75-
voice_config=VoiceConfig(
76-
provider="openai",
77-
voice_name="onyx",
78-
speed=1.0
79-
)
94+
voice_config=voice_config
8095
)
8196

8297
self.support_personas["default"] = support_persona
@@ -94,17 +109,19 @@ def _load_default_customer_personas(self):
94109
"issue": "School fee payment failed due to technical error",
95110
"goal": "Understand the issue and make the payment",
96111
"special_behavior": "",
97-
"difficulty": "easy"
112+
"difficulty": "easy",
113+
"languages": ["hi", "en"] # Hinglish speaker
98114
},
99115
"angry_insufficient_funds": {
100116
"name": "Angry Parent - Financial Stress",
101117
"customer_name": "प्रिया गुप्ता", # Priya Gupta
102-
"personality": "Frustrated parent dealing with financial stress",
118+
"personality": "Frustrated female parent dealing with financial stress",
103119
"emotional_state": "angry",
104120
"issue": "Payment failed due to insufficient funds, but angry about repeated calls",
105121
"goal": "Express frustration and potentially avoid immediate payment",
106122
"special_behavior": "Start angry but may calm down if agent is empathetic",
107-
"difficulty": "hard"
123+
"difficulty": "hard",
124+
"languages": ["hi", "en"] # Hinglish speaker
108125
},
109126
"wrong_person_family": {
110127
"name": "Wrong Person - Wife Takes Message",
@@ -124,7 +141,8 @@ def _load_default_customer_personas(self):
124141
"issue": "Doesn't understand online payments, usually son handles it",
125142
"goal": "Understand what's happening and get help from son",
126143
"special_behavior": "Mix Hindi and English. Ask agent to speak slowly. Mention 'मेरा बेटा' (my son) handles these things",
127-
"difficulty": "medium"
144+
"difficulty": "medium",
145+
"languages": ["hi", "en"] # Hinglish speaker (primarily Hindi)
128146
},
129147
"financial_hardship": {
130148
"name": "Financial Hardship - Needs Help",
@@ -185,15 +203,34 @@ def _load_default_customer_personas(self):
185203
EmotionalState.NEUTRAL
186204
)
187205

188-
# Determine voice config based on personality
189-
voice_config = VoiceConfig(provider="openai", speed=1.0)
190-
if "female" in scenario_data["personality"].lower() or "wife" in scenario_data["personality"].lower():
191-
voice_config.voice_name = "nova" # Female voice
192-
elif "elderly" in scenario_data["personality"].lower():
193-
voice_config.voice_name = "fable" # Older sounding voice
194-
voice_config.speed = 0.9 # Slightly slower
195-
else:
196-
voice_config.voice_name = "echo" # Default male voice
206+
# Determine gender from personality
207+
personality_lower = scenario_data["personality"].lower()
208+
gender = "female" if ("female" in personality_lower or "wife" in personality_lower) else "male"
209+
210+
# Get languages from scenario data (default to English)
211+
languages = scenario_data.get("languages", ["en"])
212+
213+
# Determine accent/country: India when the persona speaks Hindi, otherwise US
214+
accent = "india" if any(lang in ["hi", "hindi"] for lang in languages) else "us"
215+
216+
# Query voice catalog for best matching voice
217+
voice_entry = self.voice_catalog.get_voice(
218+
provider=self.tts_provider,
219+
languages=languages,
220+
accent=accent,
221+
gender=gender,
222+
persona_type="customer"
223+
)
224+
225+
# Create voice config from catalog entry
226+
voice_config = VoiceConfig(provider=self.tts_provider, speed=1.0)
227+
if voice_entry:
228+
voice_config.voice_id = voice_entry.voice_id
229+
voice_config.voice_name = voice_entry.name
230+
231+
# Adjust speed for elderly personas
232+
if "elderly" in personality_lower:
233+
voice_config.speed = 0.9
197234

198235
persona = CustomerPersona(
199236
id=scenario_id,
@@ -204,6 +241,7 @@ def _load_default_customer_personas(self):
204241
goal=scenario_data["goal"],
205242
special_behavior=scenario_data.get("special_behavior", ""),
206243
difficulty=scenario_data.get("difficulty", "medium"),
244+
languages=languages, # Include languages from scenario
207245
voice_config=voice_config
208246
)
209247

0 commit comments

Comments
 (0)