Skip to content

Commit 558f095

Browse files
committed
TTS
1 parent 6b6eeab commit 558f095

File tree

12 files changed

+2297
-156
lines changed

12 files changed

+2297
-156
lines changed

.env.example

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,18 +68,59 @@ FAST_MODE_PRESERVE_IMAGES=true
6868
DEBUG_MODE=false
6969

7070
# TTS (Text-to-Speech) Configuration
71-
# Generate audio narration of translated documents using Edge-TTS (Microsoft voices)
71+
# Generate audio narration of translated documents
7272
# Requires ffmpeg installed on the system for Opus encoding
7373
TTS_ENABLED=false
74-
# TTS Provider (currently only edge-tts is supported)
74+
75+
# TTS Provider Selection
76+
# Available providers:
77+
# - edge-tts: Microsoft Edge neural voices (free, cloud-based, no GPU required)
78+
# - chatterbox: Chatterbox TTS (local, GPU-accelerated, voice cloning support)
7579
TTS_PROVIDER=edge-tts
80+
7681
# Voice selection (leave empty for auto-selection based on target language)
77-
# Examples: zh-CN-XiaoxiaoNeural (Chinese female), zh-CN-YunxiNeural (Chinese male)
78-
# See all voices: https://speech.microsoft.com/portal/voicegallery
82+
# Edge-TTS examples: zh-CN-XiaoxiaoNeural (Chinese female), zh-CN-YunxiNeural (Chinese male)
83+
# Chatterbox: Uses language codes (e.g., "en", "zh", "fr") - see the supported languages list below
84+
# See Edge-TTS voices: https://speech.microsoft.com/portal/voicegallery
7985
TTS_VOICE=
86+
8087
# Speech rate adjustment (-50% to +100%, e.g., "+10%", "-20%")
8188
TTS_RATE=+0%
8289
# Opus bitrate for output audio (e.g., 48k, 64k, 96k, 128k)
8390
TTS_BITRATE=64k
8491
# Output format (opus recommended for compact file size)
85-
TTS_OUTPUT_FORMAT=opus
92+
TTS_OUTPUT_FORMAT=opus
93+
94+
# ===== Chatterbox TTS Configuration =====
95+
# Chatterbox is a GPU-accelerated local TTS with voice cloning capabilities
96+
# GitHub: https://github.com/resemble-ai/chatterbox
97+
# Install: pip install chatterbox-tts torch torchaudio
98+
99+
# GPU/CUDA Requirements:
100+
# - NVIDIA GPU with CUDA support (recommended: 6GB+ VRAM)
101+
# - CUDA Toolkit 11.8 or 12.x installed
102+
# - PyTorch with CUDA support
103+
# - Falls back to CPU if no GPU available (significantly slower)
104+
105+
# Voice prompt for voice cloning (optional)
106+
# Path to a reference audio file (WAV, MP3, etc.) for voice cloning
107+
# Leave empty to use the default Chatterbox voice
108+
TTS_VOICE_PROMPT_PATH=
109+
110+
# Emotion exaggeration level (0.0 to 1.0)
111+
# 0.0 = neutral/flat, 1.0 = highly expressive
112+
# Default: 0.5 for balanced expressiveness
113+
TTS_EXAGGERATION=0.5
114+
115+
# Classifier-free guidance weight (0.0 to 1.0)
116+
# Higher values = more stable/predictable output
117+
# Lower values = more varied but potentially less consistent
118+
# Default: 0.5 for balanced stability
119+
TTS_CFG_WEIGHT=0.5
120+
121+
# Chatterbox supported languages (23 languages):
122+
# en (English), es (Spanish), fr (French), de (German), it (Italian),
123+
# pt (Portuguese), pl (Polish), tr (Turkish), ru (Russian), nl (Dutch),
124+
# cs (Czech), ar (Arabic), zh (Chinese), ja (Japanese), hu (Hungarian),
125+
# ko (Korean), hi (Hindi), vi (Vietnamese), sv (Swedish), da (Danish),
126+
# fi (Finnish), id (Indonesian), el (Greek)

requirements.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,10 @@ python-dotenv
1010
aiofiles
1111
tiktoken>=0.5.0
1212
edge-tts>=6.1.9
13+
14+
# Chatterbox TTS (GPU-accelerated local TTS) - OPTIONAL
15+
# These dependencies have strict numpy version requirements that may conflict.
16+
# Install separately if needed:
17+
# pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
18+
# pip install chatterbox-tts
19+
# Requires CUDA-capable GPU for optimal performance (CPU fallback available but slow)

src/api/blueprints/tts_routes.py

Lines changed: 245 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,32 @@
11
"""
22
TTS (Text-to-Speech) routes for generating audio from existing files
3+
4+
Supports multiple TTS providers:
5+
- edge-tts: Microsoft Edge neural voices (cloud-based)
6+
- chatterbox: ResembleAI's local GPU-accelerated TTS with voice cloning
37
"""
48
import os
59
import asyncio
610
import logging
711
import threading
812
import uuid
913
from flask import Blueprint, request, jsonify, current_app
10-
11-
from src.tts.tts_config import TTSConfig
14+
from werkzeug.utils import secure_filename
15+
16+
from src.tts.tts_config import TTSConfig, CHATTERBOX_VOICES, DEFAULT_VOICES
17+
from src.tts.providers import (
18+
is_chatterbox_available,
19+
get_gpu_status,
20+
CHATTERBOX_LANGUAGES,
21+
)
1222
from src.utils.file_utils import generate_tts_for_translation
1323
from src.api.services import FileService
1424

1525
logger = logging.getLogger(__name__)
1626

27+
# Allowed audio extensions for voice prompt upload
28+
ALLOWED_VOICE_PROMPT_EXTENSIONS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a'}
29+
1730

1831
def create_tts_blueprint(output_dir, socketio):
1932
"""
@@ -133,10 +146,15 @@ def generate_tts():
133146
{
134147
"filename": "translated_book.epub",
135148
"target_language": "Chinese",
149+
"tts_provider": "edge-tts", // or "chatterbox"
136150
"tts_voice": "", // Optional, auto-select if empty
137151
"tts_rate": "+0%",
138152
"tts_format": "opus",
139-
"tts_bitrate": "64k"
153+
"tts_bitrate": "64k",
154+
// Chatterbox-specific options:
155+
"tts_voice_prompt_path": "", // Path to audio file for voice cloning
156+
"tts_exaggeration": 0.5, // Emotion level (0.0-1.0)
157+
"tts_cfg_weight": 0.5 // Classifier-free guidance weight
140158
}
141159
"""
142160
try:
@@ -157,17 +175,30 @@ def generate_tts():
157175

158176
# Get TTS configuration from request
159177
target_language = data.get('target_language', 'English')
178+
provider = data.get('tts_provider', 'edge-tts')
179+
180+
# Validate provider choice
181+
if provider == 'chatterbox' and not is_chatterbox_available():
182+
return jsonify({
183+
"error": "Chatterbox TTS is not available",
184+
"details": "Missing dependencies: torch, chatterbox-tts, or torchaudio. "
185+
"Install with: pip install chatterbox-tts torch torchaudio"
186+
}), 400
160187

161188
tts_config = TTSConfig(
162189
enabled=True,
163-
provider='edge-tts',
190+
provider=provider,
164191
voice=data.get('tts_voice', ''),
165192
rate=data.get('tts_rate', '+0%'),
166193
volume=data.get('tts_volume', '+0%'),
167194
pitch=data.get('tts_pitch', '+0Hz'),
168195
output_format=data.get('tts_format', 'opus'),
169196
bitrate=data.get('tts_bitrate', '64k'),
170-
target_language=target_language
197+
target_language=target_language,
198+
# Chatterbox-specific settings
199+
voice_prompt_path=data.get('tts_voice_prompt_path', ''),
200+
exaggeration=float(data.get('tts_exaggeration', 0.5)),
201+
cfg_weight=float(data.get('tts_cfg_weight', 0.5)),
171202
)
172203

173204
# Generate job ID
@@ -208,9 +239,7 @@ def get_tts_status(job_id):
208239

209240
@bp.route('/api/tts/voices', methods=['GET'])
210241
def list_voices():
211-
"""List available TTS voices by language"""
212-
from src.tts.tts_config import DEFAULT_VOICES
213-
242+
"""List available TTS voices by language for Edge-TTS (default)"""
214243
# Group voices by language
215244
voices_by_language = {}
216245
for key, voice in DEFAULT_VOICES.items():
@@ -223,4 +252,212 @@ def list_voices():
223252
"default_provider": "edge-tts"
224253
})
225254

255+
@bp.route('/api/tts/voices/chatterbox', methods=['GET'])
def list_chatterbox_voices():
    """
    Report the languages offered by the Chatterbox TTS provider.

    Chatterbox has no named voices: the voice comes from an uploaded
    voice prompt (voice cloning) or from the model's default voice, so
    this endpoint lists languages instead.

    Returns:
        JSON with the availability flag, the supported languages and
        their count, the feature flags, and an explanatory note
    """
    capabilities = {
        "voice_cloning": True,
        "emotion_control": True,
        "gpu_acceleration": True,
    }

    payload = {
        "available": is_chatterbox_available(),
        "provider": "chatterbox",
        "languages": CHATTERBOX_LANGUAGES,
        "language_count": len(CHATTERBOX_LANGUAGES),
        "features": capabilities,
        "note": "Voice is determined by uploaded voice prompt or uses default model voice",
    }
    return jsonify(payload)
280+
281+
@bp.route('/api/tts/providers', methods=['GET'])
def list_providers():
    """
    Describe every TTS provider exposed by this API.

    Returns:
        JSON mapping each provider id to its name, description,
        availability, feature flags and language count, together with
        the id of the default provider
    """
    # Count only the DEFAULT_VOICES keys that are longer than two
    # characters and contain no hyphen.
    edge_language_count = sum(
        1 for key in DEFAULT_VOICES if len(key) > 2 and '-' not in key
    )

    edge_tts_info = {
        "name": "Edge TTS",
        "description": "Microsoft Edge neural voices (cloud-based)",
        "available": True,  # reported unconditionally, no dependency probe
        "features": {
            "voice_selection": True,
            "rate_control": True,
            "volume_control": True,
            "pitch_control": True,
            "voice_cloning": False,
            "gpu_required": False,
        },
        "language_count": edge_language_count,
    }

    chatterbox_info = {
        "name": "Chatterbox TTS",
        "description": "Local GPU-accelerated TTS with voice cloning",
        "available": is_chatterbox_available(),
        "features": {
            "voice_selection": False,  # voice comes from the audio prompt
            "rate_control": False,
            "volume_control": False,
            "pitch_control": False,
            "voice_cloning": True,
            "emotion_control": True,
            "gpu_required": True,
        },
        "language_count": len(CHATTERBOX_LANGUAGES),
    }

    return jsonify({
        "providers": {
            "edge-tts": edge_tts_info,
            "chatterbox": chatterbox_info,
        },
        "default": "edge-tts",
    })
325+
326+
@bp.route('/api/tts/gpu-status', methods=['GET'])
def gpu_status():
    """
    Report GPU information together with Chatterbox readiness.

    Returns:
        JSON holding everything get_gpu_status() reports, extended with
        a "chatterbox_ready" entry
    """
    gpu_info = get_gpu_status()

    # Ready only when Chatterbox is available and the GPU report marks
    # CUDA as available (a missing key counts as not available).
    ready = is_chatterbox_available() and gpu_info.get("cuda_available", False)
    gpu_info["chatterbox_ready"] = ready

    return jsonify(gpu_info)
338+
339+
@bp.route('/api/tts/voice-prompt/upload', methods=['POST'])
def upload_voice_prompt():
    """
    Upload an audio file for voice cloning with Chatterbox TTS.

    The file is stored in the ``voice_prompts`` folder of the output
    directory under its sanitized name, prefixed with eight random hex
    characters so that uploads with the same name do not overwrite each
    other. The returned path can be referenced in TTS generation
    requests.

    Form data:
        file: Audio file (WAV, MP3, FLAC, OGG, M4A)

    Returns:
        JSON with the stored filename and its path on success.
        400 when no file was sent or its extension is not allowed.
        500 when the directory cannot be created or the file cannot be
        written.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400

    file = request.files['file']
    # Covers both an empty string and a missing (None) filename.
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400

    # Validate the extension of the sanitized name, since that is the
    # name that ends up on disk.
    filename = secure_filename(file.filename)
    ext = os.path.splitext(filename)[1].lower()

    if ext not in ALLOWED_VOICE_PROMPT_EXTENSIONS:
        return jsonify({
            "error": f"Invalid file type: {ext}",
            # Sorted so the response is stable across requests.
            "allowed": sorted(ALLOWED_VOICE_PROMPT_EXTENSIONS)
        }), 400

    voice_prompts_dir = os.path.join(output_dir, 'voice_prompts')

    # Generate unique filename to avoid conflicts
    unique_filename = f"{uuid.uuid4().hex[:8]}_{filename}"
    save_path = os.path.join(voice_prompts_dir, unique_filename)

    try:
        # Directory creation is inside the try so that a failure here is
        # reported as a JSON error like any other save failure.
        os.makedirs(voice_prompts_dir, exist_ok=True)
        file.save(save_path)
    except Exception as e:
        # logger.exception keeps the traceback in the log.
        logger.exception(f"Failed to save voice prompt: {e}")
        return jsonify({
            "error": "Failed to save voice prompt",
            "details": str(e)
        }), 500

    logger.info(f"Voice prompt saved: {save_path}")

    return jsonify({
        "success": True,
        "filename": unique_filename,
        "path": save_path,
        "message": "Voice prompt uploaded successfully"
    })
395+
396+
@bp.route('/api/tts/voice-prompts', methods=['GET'])
def list_voice_prompts():
    """
    List available voice prompt files for voice cloning.

    Only regular files with an allowed audio extension are reported,
    sorted by filename. The response has the same keys whether or not
    the voice prompt directory exists.

    Returns:
        JSON with the list of voice prompt files (filename, path, size
        in bytes, extension), the directory that was scanned, and the
        number of files found
    """
    voice_prompts_dir = os.path.join(output_dir, 'voice_prompts')

    prompts = []
    # A missing directory simply means nothing has been uploaded yet.
    if os.path.isdir(voice_prompts_dir):
        for filename in sorted(os.listdir(voice_prompts_dir)):
            ext = os.path.splitext(filename)[1].lower()
            if ext not in ALLOWED_VOICE_PROMPT_EXTENSIONS:
                continue

            filepath = os.path.join(voice_prompts_dir, filename)
            # Skip sub-directories that merely carry an audio-like suffix.
            if not os.path.isfile(filepath):
                continue

            try:
                size_bytes = os.path.getsize(filepath)
            except OSError:
                # The file disappeared between listing and stat
                # (e.g. a concurrent delete); leave it out.
                continue

            prompts.append({
                "filename": filename,
                "path": filepath,
                "size_bytes": size_bytes,
                "extension": ext
            })

    return jsonify({
        "voice_prompts": prompts,
        "directory": voice_prompts_dir,
        "count": len(prompts)
    })
429+
430+
@bp.route('/api/tts/voice-prompt/<filename>', methods=['DELETE'])
def delete_voice_prompt(filename):
    """
    Delete a voice prompt file.

    The name is passed through secure_filename() before it is joined to
    the voice prompt directory, so only entries directly inside that
    directory can be addressed.

    Args:
        filename: Name of the voice prompt file to delete

    Returns:
        JSON with success status; 404 when no such file exists, 500 when
        the file cannot be removed
    """
    voice_prompts_dir = os.path.join(output_dir, 'voice_prompts')
    safe_name = secure_filename(filename)
    filepath = os.path.join(voice_prompts_dir, safe_name)

    # secure_filename() can reduce a name to "", in which case the joined
    # path is the directory itself. Require a non-empty name that points
    # at a regular file so such requests get a 404 instead of a failed
    # os.remove() on a directory.
    if not safe_name or not os.path.isfile(filepath):
        return jsonify({"error": "Voice prompt not found"}), 404

    try:
        os.remove(filepath)
    except Exception as e:
        # logger.exception keeps the traceback in the log.
        logger.exception(f"Failed to delete voice prompt: {e}")
        return jsonify({
            "error": "Failed to delete voice prompt",
            "details": str(e)
        }), 500

    logger.info(f"Voice prompt deleted: {filepath}")

    return jsonify({
        "success": True,
        "message": f"Voice prompt '{filename}' deleted"
    })
462+
226463
return bp

0 commit comments

Comments
 (0)