Skip to content

Commit f3f8d03

Browse files
committed
Adds voice classification functionality for Gemini and ElevenLabs voices, including parsing, validation, and storage of voice metadata.
1 parent 10b71ec commit f3f8d03

File tree

4 files changed

+241
-35
lines changed

4 files changed

+241
-35
lines changed

app.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,54 @@ def get_gemini_sample(voice_name):
148148
return "Sample directory not found", 404
149149
return send_from_directory(sample_path, f"{voice_name}.mp3")
150150

151+
def _voice_classifications_response(provider_dir, key_field, fields, log_prefix):
    """Build the JSON response for one provider's voice classifications.

    Reads ``samples/<provider_dir>/voice_classifications.json`` (located
    through ``get_asset_path``) and converts its list of entries into a
    lookup dictionary keyed by ``entry[key_field]``.

    Args:
        provider_dir: Sub-directory of ``samples`` that holds the JSON file.
        key_field: Entry field used as the key of the lookup dictionary.
        fields: Entry fields copied into each value; a field missing from
            an entry is reported as ``'unknown'``.
        log_prefix: Text logged in front of the exception when loading fails.

    Returns:
        ``(response, 404)`` when the file cannot be located,
        ``(response, 500)`` when it cannot be read or parsed, otherwise the
        ``jsonify`` response holding the lookup dictionary.
    """
    classifications_path = get_asset_path(
        os.path.join("samples", provider_dir, "voice_classifications.json"))
    if not classifications_path:
        return jsonify({'error': 'Classifications file not found'}), 404
    try:
        with open(classifications_path, 'r', encoding='utf-8') as f:
            entries = json.load(f)
        if not isinstance(entries, list):
            raise ValueError(
                f"expected a JSON list, got {type(entries).__name__}")
        # Skip entries that recorded an error, lack the key field, or are
        # not objects at all, so one malformed entry does not turn the
        # whole response into a 500.
        classifications_dict = {
            entry[key_field]: {
                field: entry.get(field, 'unknown') for field in fields
            }
            for entry in entries
            if isinstance(entry, dict)
            and 'error' not in entry
            and key_field in entry
        }
        return jsonify(classifications_dict)
    except Exception as e:
        # Route boundary: report the failure as JSON, but keep the traceback
        # in the log so the cause can be diagnosed.
        logger.error(f"{log_prefix}: {e}", exc_info=True)
        return jsonify({'error': 'Could not load classifications'}), 500


@app.route('/api/voice_classifications', methods=['GET'])
def get_voice_classifications():
    """Returns voice classifications for Gemini voices, keyed by filename."""
    return _voice_classifications_response(
        "gemini_voices",
        "filename",
        ('gender', 'age_group', 'accent', 'speaking_style'),
        "Error loading voice classifications",
    )


@app.route('/api/elevenlabs_voice_classifications', methods=['GET'])
def get_elevenlabs_voice_classifications():
    """Returns voice classifications for ElevenLabs voices, keyed by voice_name."""
    # Gender is excluded since it's already provided by the ElevenLabs API.
    return _voice_classifications_response(
        "elevenlabs_voices",
        "voice_name",
        ('age_group', 'accent', 'speaking_style'),
        "Error loading ElevenLabs voice classifications",
    )
198+
151199
def run_generation_task(task_id, script_text, app_settings, output_filepath, api_key):
152200
"""The target function for the generation thread."""
153201
stop_event = tasks[task_id]['stop_event']

gui.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,14 +1518,24 @@ def _run_fetch():
15181518
voices = []
15191519
for voice in data.get('voices', []):
15201520
labels = voice.get('labels', {}) if voice.get('labels') else {}
1521-
desc_parts = [p.title() for p in [labels.get('gender'), labels.get('age'), labels.get('accent')] if
1522-
p]
1523-
description = ', '.join(desc_parts) or str(voice.get('category', '')).title()
1524-
display_name = f"{voice.get('name', 'Unknown')} - {description}" if description else voice.get(
1525-
'name', 'Unknown')
1521+
# Build short description from labels (like Gemini format)
1522+
# Format: gender, age, accent, use_case (lowercase except accent)
1523+
gender = labels.get('gender', '')
1524+
age = labels.get('age', '').replace('_', ' ')
1525+
accent = labels.get('accent', '').title() # Capitalize accent
1526+
use_case = labels.get('use_case', '').replace('_', ' ')
1527+
1528+
desc_parts = [p for p in [gender, age, accent, use_case] if p]
1529+
short_description = ', '.join(desc_parts) if desc_parts else str(voice.get('category', '')).title()
1530+
1531+
# Store the full API description separately
1532+
full_description = voice.get('description', '').strip()
1533+
1534+
display_name = f"{voice.get('name', 'Unknown')} ({short_description})" if short_description else voice.get('name', 'Unknown')
15261535
voices.append({'id': voice.get('voice_id', ''), 'name': voice.get('name', 'Unknown'),
15271536
'display_name': display_name, 'category': voice.get('category', ''),
1528-
'labels': labels, 'preview_url': voice.get('preview_url', '')})
1537+
'labels': labels, 'preview_url': voice.get('preview_url', ''),
1538+
'description': full_description, 'short_description': short_description})
15291539
voices.sort(key=lambda x: x.get('name', ''))
15301540
self.elevenlabs_voices_cache = voices
15311541
self.logger.info(f"Successfully pre-fetched {len(voices)} ElevenLabs voices.")

settings_window.py

Lines changed: 125 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
import json
55
import keyring
66
import sys
7+
import os
78
import customtkinter
89

910
from gui import AVAILABLE_VOICES
11+
from utils import get_asset_path
1012

1113
try:
1214
import requests
@@ -15,8 +17,6 @@
1517

1618

1719
class VoiceSettingsWindow(customtkinter.CTkToplevel):
18-
VOICE_DISPLAY_LIST = [f"{name} - {desc}" for name, desc in AVAILABLE_VOICES.items()]
19-
2020
def __init__(self, parent, current_settings, save_callback, close_callback, default_settings,
2121
preloaded_elevenlabs_voices=None,
2222
play_gemini_sample_callback=None,
@@ -28,6 +28,10 @@ def __init__(self, parent, current_settings, save_callback, close_callback, defa
2828
self.transient(parent)
2929
self.grab_set()
3030

31+
# Load Gemini voice classifications and build enriched display list
32+
self.gemini_voice_classifications = self._load_gemini_classifications()
33+
self.VOICE_DISPLAY_LIST = self._build_gemini_display_list()
34+
3135
# Fix pour le bandeau de titre sombre sur Windows
3236
if sys.platform == "win32":
3337
try:
@@ -126,6 +130,50 @@ def __init__(self, parent, current_settings, save_callback, close_callback, defa
126130
# A longer delay is more robust on slower systems or different architectures like Mac ARM.
127131
self.after(1000, self._enable_play_buttons)
128132

133+
def _load_gemini_classifications(self):
    """Load Gemini voice classifications from JSON file.

    Returns:
        A dict keyed by each entry's ``filename`` whose values hold the
        ``gender``, ``age_group``, ``accent`` and ``speaking_style`` fields
        (``'unknown'`` when a field is absent). An empty dict is returned
        when the file cannot be located, read, or parsed.
    """
    classifications_path = get_asset_path(os.path.join("samples", "gemini_voices", "voice_classifications.json"))
    if not classifications_path:
        return {}

    fields = ('gender', 'age_group', 'accent', 'speaking_style')
    try:
        with open(classifications_path, 'r', encoding='utf-8') as f:
            classifications_list = json.load(f)
        if not isinstance(classifications_list, list):
            raise ValueError(
                f"expected a JSON list, got {type(classifications_list).__name__}")
        # Skip entries that recorded an error, lack a filename, or are not
        # objects, so a single malformed entry does not discard every
        # classification.
        # NOTE(review): _build_gemini_display_list looks entries up by voice
        # name; confirm the JSON 'filename' values match those names exactly
        # (e.g. carry no file extension).
        return {
            entry['filename']: {
                field: entry.get(field, 'unknown') for field in fields
            }
            for entry in classifications_list
            if isinstance(entry, dict)
            and 'error' not in entry
            and 'filename' in entry
        }
    except Exception as e:
        # Best-effort: the caller falls back to plain descriptions, but the
        # traceback is kept so a broken file can be diagnosed.
        logging.error(f"Error loading Gemini voice classifications: {e}", exc_info=True)
        return {}
156+
157+
def _build_gemini_display_list(self):
    """Build enriched display list for Gemini voices using classifications.

    Returns:
        A list with one string per entry of ``AVAILABLE_VOICES``:
        ``"Name (gender, age_group, Accent, speaking_style)"`` when a
        classification exists for the voice, otherwise the original
        ``"Name - description"`` format.
    """
    display_list = []
    for name, desc in AVAILABLE_VOICES.items():
        classification = self.gemini_voice_classifications.get(name)
        if not classification:
            # Fallback to original format if no classification
            display_list.append(f"{name} - {desc}")
            continue

        # dict.get only applies its default when the key is absent, so an
        # explicit null in the JSON would arrive here as None; `or ''` and
        # str() keep .title() and ', '.join from raising on such values.
        gender = str(classification.get('gender') or '')
        age_group = str(classification.get('age_group') or '')
        accent = str(classification.get('accent') or '').title()  # Capitalize accent
        speaking_style = str(classification.get('speaking_style') or '')

        desc_parts = [p for p in (gender, age_group, accent, speaking_style) if p]
        enriched_desc = ', '.join(desc_parts) if desc_parts else desc
        display_list.append(f"{name} ({enriched_desc})")
    return display_list
176+
129177
def _enable_play_buttons(self):
130178
"""Active tous les boutons de lecture dans les guides vocaux."""
131179
for button in self.guide_play_buttons:
@@ -233,7 +281,25 @@ def _populate_guide_tab(self, scrollable_frame, provider):
233281

234282
voices = list(AVAILABLE_VOICES.items())
235283
for i, (name, desc) in enumerate(voices):
236-
self._create_guide_row(scrollable_frame, provider, name, f"{name} - {desc}", name)
284+
# Build enriched display for Gemini voices using classifications
285+
classification = self.gemini_voice_classifications.get(name, {})
286+
if classification:
287+
# Format: "Name (gender, age_group, Accent, speaking_style)"
288+
gender = classification.get('gender', '')
289+
age_group = classification.get('age_group', '')
290+
accent = classification.get('accent', '').title() # Capitalize accent
291+
speaking_style = classification.get('speaking_style', '')
292+
293+
desc_parts = [p for p in [gender, age_group, accent, speaking_style] if p]
294+
enriched_desc = ', '.join(desc_parts) if desc_parts else desc
295+
display_name = f"{name} ({enriched_desc})"
296+
else:
297+
# Fallback to original format if no classification
298+
display_name = f"{name} - {desc}"
299+
300+
# Pass enriched description or original desc as full_description
301+
full_description = enriched_desc if classification else desc
302+
self._create_guide_row(scrollable_frame, provider, name, display_name, name, full_description)
237303
if i < len(voices) - 1:
238304
separator = customtkinter.CTkFrame(scrollable_frame, height=1, fg_color=("gray80", "gray25"))
239305
separator.pack(fill='x', pady=5, padx=5)
@@ -242,7 +308,7 @@ def _populate_guide_tab(self, scrollable_frame, provider):
242308
if provider == "gemini":
243309
self._gemini_voices_displayed = True
244310

245-
def _create_guide_row(self, parent, provider, voice_id, display_name, play_identifier):
311+
def _create_guide_row(self, parent, provider, voice_id, display_name, play_identifier, full_description=None):
246312
# Set a fixed height for each row and prevent it from resizing.
247313
# This makes layout calculations much faster and scrolling smoother.
248314
row_frame = customtkinter.CTkFrame(parent, fg_color="transparent", height=55)
@@ -257,12 +323,26 @@ def _create_guide_row(self, parent, provider, voice_id, display_name, play_ident
257323
# Use sticky="w" to align left, the rowconfigure will handle vertical centering
258324
text_frame.grid(row=0, column=0, sticky="w", padx=(5, 10))
259325

260-
name, _, description = display_name.partition(" - ")
326+
# For display: use name with tags in parentheses (e.g., "Roger (Male, Middle Aged, American)")
327+
# For full description: show ElevenLabs description or Gemini description
328+
if " - " in display_name:
329+
name, _, description = display_name.partition(" - ")
330+
elif " (" in display_name:
331+
name, _, description = display_name.partition(" (")
332+
description = description.rstrip(")")
333+
else:
334+
name = display_name
335+
description = ""
336+
261337
customtkinter.CTkLabel(text_frame, text=name, font=customtkinter.CTkFont(weight="bold"),
262338
anchor="w").pack(anchor="w", fill="x")
263-
if description:
339+
340+
# Show full_description if provided (ElevenLabs), otherwise use short description
341+
desc_to_show = full_description if full_description else description
342+
if desc_to_show:
264343
# We still wrap, but the container's fixed height prevents layout jumps.
265-
customtkinter.CTkLabel(text_frame, text=description, anchor="w", wraplength=400,
344+
# Use justify="left" for proper text alignment in wrapped text
345+
customtkinter.CTkLabel(text_frame, text=desc_to_show, anchor="w", justify="left", wraplength=400,
266346
font=customtkinter.CTkFont(size=11)).pack(anchor="w", fill="x")
267347

268348
buttons_inner = customtkinter.CTkFrame(row_frame, fg_color="transparent")
@@ -318,7 +398,7 @@ def _load_more_elevenlabs_voices(self):
318398
if voices_to_display: # Seulement si on a des voix à afficher
319399
for voice in voices_to_display:
320400
self._create_guide_row(self.elevenlabs_scroll_frame, "elevenlabs", voice['id'],
321-
voice['display_name'], voice['preview_url'])
401+
voice['display_name'], voice['preview_url'], voice.get('description', ''))
322402
separator = customtkinter.CTkFrame(self.elevenlabs_scroll_frame, height=1,
323403
fg_color=("gray80", "gray25"))
324404
separator.pack(fill='x', pady=5, padx=5)
@@ -377,14 +457,24 @@ def fetch_voices():
377457
voices = []
378458
for voice in voices_data:
379459
labels = voice.get('labels', {}) or {}
380-
desc_parts = [p.title() for p in [labels.get('gender'), labels.get('age'), labels.get('accent')]
381-
if p]
382-
description = ', '.join(desc_parts) or str(voice.get('category', '')).title()
383-
display_name = f"{voice.get('name', 'Unknown')} - {description}" if description else voice.get(
384-
'name', 'Unknown')
460+
# Build short description from labels (like Gemini format)
461+
# Format: gender, age, accent, use_case (lowercase except accent)
462+
gender = labels.get('gender', '')
463+
age = labels.get('age', '').replace('_', ' ')
464+
accent = labels.get('accent', '').title() # Capitalize accent
465+
use_case = labels.get('use_case', '').replace('_', ' ')
466+
467+
desc_parts = [p for p in [gender, age, accent, use_case] if p]
468+
short_description = ', '.join(desc_parts) if desc_parts else str(voice.get('category', '')).title()
469+
470+
# Store the full API description separately
471+
full_description = voice.get('description', '').strip()
472+
473+
display_name = f"{voice.get('name', 'Unknown')} ({short_description})" if short_description else voice.get('name', 'Unknown')
385474
voices.append({'id': voice.get('voice_id', ''), 'name': voice.get('name', 'Unknown'),
386475
'display_name': display_name, 'category': voice.get('category', ''),
387-
'labels': labels, 'preview_url': voice.get('preview_url', '')})
476+
'labels': labels, 'preview_url': voice.get('preview_url', ''),
477+
'description': full_description, 'short_description': short_description})
388478
voices.sort(key=lambda x: x.get('name', ''))
389479
self.elevenlabs_voices = voices
390480
self.elevenlabs_voices_loaded = True
@@ -396,7 +486,9 @@ def fetch_voices():
396486
self.elevenlabs_voices, self.elevenlabs_voices_loaded = [], False
397487
finally:
398488
if not self.winfo_exists(): return
399-
self.after(100, self.populate_fields_delayed)
489+
# No need to call populate_fields_delayed here anymore
490+
# The check_voices_update() loop will handle updating the comboboxes
491+
self._loading_voices = False
400492

401493
threading.Thread(target=fetch_voices, daemon=True).start()
402494

@@ -422,18 +514,27 @@ def update_elevenlabs_comboboxes(self):
422514
try:
423515
if not self.elevenlabs_voices_loaded or not self.elevenlabs_voices or not self.winfo_exists():
424516
return
517+
logging.info(f"Updating ElevenLabs comboboxes with {len(self.elevenlabs_voices)} voices")
425518
elevenlabs_values = [voice['display_name'] for voice in self.elevenlabs_voices]
519+
updated_count = 0
426520
for row in self.entries:
427521
if 'elevenlabs_voice' in row and row['elevenlabs_voice']:
428522
try:
429523
current_value = row['elevenlabs_voice'].get()
430-
row['elevenlabs_voice']['values'] = elevenlabs_values
431-
if current_value in elevenlabs_values:
524+
row['elevenlabs_voice'].configure(values=elevenlabs_values)
525+
# If the current value is "Loading..." or not in the list, set to first voice
526+
if current_value == "Loading..." or current_value not in elevenlabs_values:
527+
if elevenlabs_values:
528+
row['elevenlabs_voice'].set(elevenlabs_values[0])
529+
elif current_value in elevenlabs_values:
432530
row['elevenlabs_voice'].set(current_value)
433-
except tk.TclError:
531+
updated_count += 1
532+
except tk.TclError as e:
533+
logging.warning(f"TclError updating combobox: {e}")
434534
continue
535+
logging.info(f"Successfully updated {updated_count} ElevenLabs comboboxes")
435536
except Exception as e:
436-
logging.warning(f"Error updating comboboxes: {e}")
537+
logging.error(f"Error updating comboboxes: {e}", exc_info=True)
437538

438539
def cancel_and_close(self):
439540
if self.close_callback: self.close_callback()
@@ -506,11 +607,14 @@ def add_row(self, speaker_name='', gemini_voice='', elevenlabs_voice=''):
506607
row_data['gemini_voice'] = gemini_combo
507608
if self.elevenlabs_api_configured:
508609
elevenlabs_values = [v['display_name'] for v in
509-
self.elevenlabs_voices] if self.elevenlabs_voices_loaded else []
610+
self.elevenlabs_voices] if self.elevenlabs_voices_loaded else ["Loading..."]
510611
elevenlabs_combo = customtkinter.CTkComboBox(row_frame, values=elevenlabs_values, width=220,
511612
state="readonly")
512613
elevenlabs_combo.pack(side=tk.LEFT, padx=(0, 10), fill='x')
513-
if elevenlabs_voice: elevenlabs_combo.set(elevenlabs_voice)
614+
if elevenlabs_voice:
615+
elevenlabs_combo.set(elevenlabs_voice)
616+
elif not self.elevenlabs_voices_loaded:
617+
elevenlabs_combo.set("Loading...")
514618
row_data['elevenlabs_voice'] = elevenlabs_combo
515619
remove_btn = customtkinter.CTkButton(row_frame, text="-", width=30,
516620
command=lambda r=row_frame: self.remove_row(r))

0 commit comments

Comments
 (0)