feat: Voice catalog with priority-based selection and MP3 audio fix

siddharthraja · claude · siddharthraja · commit f249263167db · 2025-11-17T15:32:55.000-08:00
## Summary

- Centralized voice catalog system for managing TTS voices across providers
- Priority-based voice selection with support/customer role separation
- Fixed MP3 audio concatenation using pydub (was truncating after first turn)
- Multi-language voice selection (Hindi, Hinglish, English)

## Key Changes

### Voice Catalog System

- New `VoiceCatalog` class with 3-tier fallback matching (exact → flexible → default)
- Added priority field to VoiceEntry for tie-breaking
- Support voices (Ishan, Devansh) never used for customers
- Customer voices (Ayush, Aarav, Aarti) never used for support
- Language-aware selection: Devansh (Hindi-only), Ishan (Hinglish/English)

### Audio Fix

- Replaced naive byte concatenation with pydub AudioSegment
- Properly combines MP3 files with headers/frames intact
- Verified with 44-second test conversation (5 turns)

### Provider Updates

- Removed hardcoded voice mappings from Cartesia/ElevenLabs providers
- OpenAI TTS now strips unsupported language parameter
- PersonaService uses VoiceCatalog for all voice selection
- Added languages field to CustomerPersona model

### Testing

- End-to-end Cartesia TTS test passes
- Voice selection priority logic validated
- All 7 voice selection scenarios tested

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/src/vcg_cli.py b/src/vcg_cli.py
@@ -62,7 +62,8 @@ async def _generate_conversation(
 
     # Initialize services
     print("📦 Loading services...")
-    persona_service = PersonaService()
+    tts_provider = config.providers.tts.get('type', 'openai')
+    persona_service = PersonaService(tts_provider=tts_provider)
     persona_service.load_default_personas()
 
     # Get personas
diff --git a/src/voice_conversation_generator/models/persona.py b/src/voice_conversation_generator/models/persona.py
@@ -125,6 +125,7 @@ class CustomerPersona(Persona):
     goal: str = ""
     special_behavior: str = ""
     difficulty: str = "medium"  # easy, medium, hard
+    languages: List[str] = field(default_factory=lambda: ['en'])  # Languages spoken by this persona
 
     def __post_init__(self):
         """Ensure type is set to CUSTOMER"""
@@ -138,7 +139,8 @@ def to_dict(self) -> Dict[str, Any]:
             "issue": self.issue,
             "goal": self.goal,
             "special_behavior": self.special_behavior,
-            "difficulty": self.difficulty
+            "difficulty": self.difficulty,
+            "languages": self.languages
         })
         return base_dict
 
@@ -166,7 +168,8 @@ def from_dict(cls, data: Dict[str, Any]) -> 'CustomerPersona':
             issue=data.get("issue", ""),
             goal=data.get("goal", ""),
             special_behavior=data.get("special_behavior", ""),
-            difficulty=data.get("difficulty", "medium")
+            difficulty=data.get("difficulty", "medium"),
+            languages=data.get("languages", ["en"])
         )
 
 
diff --git a/src/voice_conversation_generator/providers/llm/openai.py b/src/voice_conversation_generator/providers/llm/openai.py
@@ -32,7 +32,7 @@ def __init__(self, config: Dict[str, Any]):
         )
 
         # Set default model
-        self.model = config.get('model', 'gpt-4')
+        self.model = config.get('model', 'gpt-4.1')
 
     async def generate_completion(
         self,
diff --git a/src/voice_conversation_generator/providers/tts/cartesia.py b/src/voice_conversation_generator/providers/tts/cartesia.py
@@ -106,39 +106,11 @@ async def generate_speech(
         if language not in self.SUPPORTED_LANGUAGES:
             language = self.default_language
 
-        # Map common voice names to Cartesia voice IDs
-        # For Hindi/Hinglish (hi language), use Indian voices
-        # For English/other languages, use default English voices
-        is_hindi = language in ['hi', 'hindi', 'hinglish']
-
-        voice_name_mapping_hi = {
-            # Hindi/Hinglish mappings (use Indian voices)
-            'onyx': 'fd2ada67-c2d9-4afe-b474-6386b87d8fc3',  # Ishan for support
-            'alloy': '1259b7e3-cb8a-43df-9446-30971a46b8b0',  # Devansh for customer
-            'echo': '1259b7e3-cb8a-43df-9446-30971a46b8b0',  # Devansh for male customer
-            'fable': '1259b7e3-cb8a-43df-9446-30971a46b8b0',  # Devansh for elderly male
-            'nova': '6ccbfb76-1fc6-48f7-b71d-91ac6298247b',  # Fallback to English female (TODO: add Hindi female)
-            'shimmer': '6ccbfb76-1fc6-48f7-b71d-91ac6298247b',  # Fallback to English female (TODO: add Hindi female)
-        }
-
-        voice_name_mapping_en = {
-            # English mappings
-            'onyx': 'a0e99841-438c-4a64-b679-ae501e7d6091',  # Professional male
-            'alloy': 'a0e99841-438c-4a64-b679-ae501e7d6091',  # Professional male
-            'echo': 'a167e0f3-df7e-4d52-a9c3-f949145efdab',  # Customer support man
-            'fable': 'a0e99841-438c-4a64-b679-ae501e7d6091',  # Professional male
-            'nova': 'f9836c6e-a0bd-460e-9d3c-f7299fa60f94',  # Professional female
-            'shimmer': '6ccbfb76-1fc6-48f7-b71d-91ac6298247b',  # Natural female
-        }
-
-        # Choose mapping based on language
-        voice_name_mapping = voice_name_mapping_hi if is_hindi else voice_name_mapping_en
-
-        # If voice_id is a name from our defaults or OpenAI mapping, resolve it
+        # Voice selection is handled by PersonaService via VoiceCatalog
+        # The voice_id should already be a valid Cartesia voice ID
+        # For backwards compatibility, check if it's a DEFAULT_VOICES key
         if voice_id in self.DEFAULT_VOICES:
             voice_id = self.DEFAULT_VOICES[voice_id]
-        elif voice_id in voice_name_mapping:
-            voice_id = voice_name_mapping[voice_id]
 
         try:
             # Generate audio using bytes streaming method
diff --git a/src/voice_conversation_generator/providers/tts/elevenlabs.py b/src/voice_conversation_generator/providers/tts/elevenlabs.py
@@ -65,14 +65,10 @@ async def generate_speech(
         if not self.available:
             raise RuntimeError("ElevenLabs client not available")
 
-        # Determine voice ID
+        # Voice selection is handled by PersonaService via VoiceCatalog
+        # The voice_id should already be a valid ElevenLabs voice ID
         voice_id = voice_config.voice_id or self.default_voice_id
 
-        # Check for speaker type hint in kwargs
-        speaker_type = kwargs.get('speaker_type', 'support')
-        if not voice_config.voice_id and speaker_type in self.DEFAULT_VOICES:
-            voice_id = self.DEFAULT_VOICES[speaker_type]
-
         # Build voice settings
         voice_settings = {
             "stability": voice_config.stability,
diff --git a/src/voice_conversation_generator/providers/tts/openai.py b/src/voice_conversation_generator/providers/tts/openai.py
@@ -53,10 +53,16 @@ async def generate_speech(
             Audio data as bytes (MP3 format)
         """
         # Use voice config or defaults
+        # Voice selection is handled by PersonaService via VoiceCatalog
+        # For OpenAI, voice_id and voice_name are the same (e.g., 'onyx', 'echo')
         model = voice_config.model or self.default_model
-        voice = voice_config.voice_name or self.default_voice
+        voice = voice_config.voice_id or voice_config.voice_name or self.default_voice
         speed = voice_config.speed
 
+        # OpenAI TTS doesn't support language parameter - it auto-detects
+        # Remove it from kwargs if present
+        kwargs.pop('language', None)
+
         # Validate voice
         if voice not in self.SUPPORTED_VOICES:
             voice = self.default_voice
diff --git a/src/voice_conversation_generator/services/__init__.py b/src/voice_conversation_generator/services/__init__.py
@@ -4,9 +4,13 @@
 from .orchestrator import ConversationOrchestrator
 from .persona_service import PersonaService
 from .provider_factory import ProviderFactory
+from .voice_catalog import VoiceCatalog, VoiceEntry, get_voice_catalog
 
 __all__ = [
     "ConversationOrchestrator",
     "PersonaService",
-    "ProviderFactory"
+    "ProviderFactory",
+    "VoiceCatalog",
+    "VoiceEntry",
+    "get_voice_catalog"
 ]
diff --git a/src/voice_conversation_generator/services/orchestrator.py b/src/voice_conversation_generator/services/orchestrator.py
@@ -367,15 +367,39 @@ async def save_conversation(
         # Combine audio if requested
         combined_audio = None
         if combine_audio and any(t.audio_data for t in conversation.turns):
-            # Combine all audio segments
+            # Combine all audio segments using pydub
             audio_segments = []
             for turn in conversation.turns:
                 if turn.audio_data:
                     audio_segments.append(turn.audio_data)
 
             if audio_segments:
-                # Simple concatenation - in production, might add silence gaps
-                combined_audio = b"".join(audio_segments)
+                # Use pydub to properly combine MP3 files
+                try:
+                    from pydub import AudioSegment
+                    import io
+
+                    # Load each MP3 segment
+                    combined = None
+                    for audio_bytes in audio_segments:
+                        segment = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
+                        if combined is None:
+                            combined = segment
+                        else:
+                            combined += segment  # Properly concatenate audio
+
+                    # Export combined audio as MP3
+                    if combined:
+                        output_buffer = io.BytesIO()
+                        combined.export(output_buffer, format='mp3', bitrate='128k')
+                        combined_audio = output_buffer.getvalue()
+
+                except ImportError:
+                    print("Warning: pydub not available. Audio combination may not work correctly.")
+                    combined_audio = b"".join(audio_segments)  # Fallback
+                except Exception as e:
+                    print(f"Warning: Audio combination failed: {e}. Using fallback.")
+                    combined_audio = b"".join(audio_segments)  # Fallback
 
         # Save to storage
         storage_paths = await self.storage.save_conversation(
diff --git a/src/voice_conversation_generator/services/persona_service.py b/src/voice_conversation_generator/services/persona_service.py
@@ -11,20 +11,24 @@
     EmotionalState,
     VoiceConfig
 )
+from .voice_catalog import get_voice_catalog
 
 
 class PersonaService:
     """Service for managing personas"""
 
-    def __init__(self, personas_dir: str = "personas"):
+    def __init__(self, personas_dir: str = "personas", tts_provider: str = "openai"):
         """Initialize persona service
 
         Args:
             personas_dir: Directory containing persona definitions
+            tts_provider: TTS provider to use for voice selection (default: 'openai')
         """
         self.personas_dir = Path(personas_dir)
         self.customer_personas: Dict[str, CustomerPersona] = {}
         self.support_personas: Dict[str, SupportPersona] = {}
+        self.tts_provider = tts_provider
+        self.voice_catalog = get_voice_catalog()
 
     def load_default_personas(self):
         """Load default personas from the existing scenarios"""
@@ -54,6 +58,21 @@ def _load_support_persona_from_file(self):
             prompt_text = """You are a helpful customer support agent.
 Be professional, empathetic, and solution-focused."""
 
+        # Query voice catalog for support agent (Hinglish support for Indian context)
+        support_voice = self.voice_catalog.get_voice(
+            provider=self.tts_provider,
+            languages=["hi", "en"],  # Hinglish support
+            accent="india",
+            gender="male",
+            persona_type="support_agent"
+        )
+
+        # Create voice config from catalog
+        voice_config = VoiceConfig(provider=self.tts_provider, speed=1.0)
+        if support_voice:
+            voice_config.voice_id = support_voice.voice_id
+            voice_config.voice_name = support_voice.name
+
         # Create default support persona
         support_persona = SupportPersona(
             id="default_support",
@@ -72,11 +91,7 @@ def _load_support_persona_from_file(self):
                 "Offer payment plans for amounts over ₹5000",
                 "Escalate to supervisor for refund requests over ₹10000"
             ],
-            voice_config=VoiceConfig(
-                provider="openai",
-                voice_name="onyx",
-                speed=1.0
-            )
+            voice_config=voice_config
         )
 
         self.support_personas["default"] = support_persona
@@ -94,17 +109,19 @@ def _load_default_customer_personas(self):
                 "issue": "School fee payment failed due to technical error",
                 "goal": "Understand the issue and make the payment",
                 "special_behavior": "",
-                "difficulty": "easy"
+                "difficulty": "easy",
+                "languages": ["hi", "en"]  # Hinglish speaker
             },
             "angry_insufficient_funds": {
                 "name": "Angry Parent - Financial Stress",
                 "customer_name": "प्रिया गुप्ता",  # Priya Gupta
-                "personality": "Frustrated parent dealing with financial stress",
+                "personality": "Frustrated female parent dealing with financial stress",
                 "emotional_state": "angry",
                 "issue": "Payment failed due to insufficient funds, but angry about repeated calls",
                 "goal": "Express frustration and potentially avoid immediate payment",
                 "special_behavior": "Start angry but may calm down if agent is empathetic",
-                "difficulty": "hard"
+                "difficulty": "hard",
+                "languages": ["hi", "en"]  # Hinglish speaker
             },
             "wrong_person_family": {
                 "name": "Wrong Person - Wife Takes Message",
@@ -124,7 +141,8 @@ def _load_default_customer_personas(self):
                 "issue": "Doesn't understand online payments, usually son handles it",
                 "goal": "Understand what's happening and get help from son",
                 "special_behavior": "Mix Hindi and English. Ask agent to speak slowly. Mention 'मेरा बेटा' (my son) handles these things",
-                "difficulty": "medium"
+                "difficulty": "medium",
+                "languages": ["hi", "en"]  # Hinglish speaker (primarily Hindi)
             },
             "financial_hardship": {
                 "name": "Financial Hardship - Needs Help",
@@ -185,15 +203,34 @@ def _load_default_customer_personas(self):
                 EmotionalState.NEUTRAL
             )
 
-            # Determine voice config based on personality
-            voice_config = VoiceConfig(provider="openai", speed=1.0)
-            if "female" in scenario_data["personality"].lower() or "wife" in scenario_data["personality"].lower():
-                voice_config.voice_name = "nova"  # Female voice
-            elif "elderly" in scenario_data["personality"].lower():
-                voice_config.voice_name = "fable"  # Older sounding voice
-                voice_config.speed = 0.9  # Slightly slower
-            else:
-                voice_config.voice_name = "echo"  # Default male voice
+            # Determine gender from personality
+            personality_lower = scenario_data["personality"].lower()
+            gender = "female" if ("female" in personality_lower or "wife" in personality_lower) else "male"
+
+            # Get languages from scenario data (default to English)
+            languages = scenario_data.get("languages", ["en"])
+
+            # Determine accent/country (default to India for this project)
+            accent = "india" if any(lang in ["hi", "hindi"] for lang in languages) else "us"
+
+            # Query voice catalog for best matching voice
+            voice_entry = self.voice_catalog.get_voice(
+                provider=self.tts_provider,
+                languages=languages,
+                accent=accent,
+                gender=gender,
+                persona_type="customer"
+            )
+
+            # Create voice config from catalog entry
+            voice_config = VoiceConfig(provider=self.tts_provider, speed=1.0)
+            if voice_entry:
+                voice_config.voice_id = voice_entry.voice_id
+                voice_config.voice_name = voice_entry.name
+
+            # Adjust speed for elderly personas
+            if "elderly" in personality_lower:
+                voice_config.speed = 0.9
 
             persona = CustomerPersona(
                 id=scenario_id,
@@ -204,6 +241,7 @@ def _load_default_customer_personas(self):
                 goal=scenario_data["goal"],
                 special_behavior=scenario_data.get("special_behavior", ""),
                 difficulty=scenario_data.get("difficulty", "medium"),
+                languages=languages,  # Include languages from scenario
                 voice_config=voice_config
             )
 
diff --git a/src/voice_conversation_generator/services/voice_catalog.py b/src/voice_conversation_generator/services/voice_catalog.py

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ def __init__(self, config: Dict[str, Any]):`
`32`	`32`	`)`
`33`	`33`
`34`	`34`	`# Set default model`
`35`		`- self.model = config.get('model', 'gpt-4')`
	`35`	`+ self.model = config.get('model', 'gpt-4.1')`
`36`	`36`
`37`	`37`	`async def generate_completion(`
`38`	`38`	`self,`