11"""
22TTS (Text-to-Speech) routes for generating audio from existing files
3+
4+ Supports multiple TTS providers:
5+ - edge-tts: Microsoft Edge neural voices (cloud-based)
6+ - chatterbox: ResembleAI's local GPU-accelerated TTS with voice cloning
37"""
48import os
59import asyncio
610import logging
711import threading
812import uuid
913from flask import Blueprint , request , jsonify , current_app
10-
11- from src .tts .tts_config import TTSConfig
14+ from werkzeug .utils import secure_filename
15+
16+ from src .tts .tts_config import TTSConfig , CHATTERBOX_VOICES , DEFAULT_VOICES
17+ from src .tts .providers import (
18+ is_chatterbox_available ,
19+ get_gpu_status ,
20+ CHATTERBOX_LANGUAGES ,
21+ )
1222from src .utils .file_utils import generate_tts_for_translation
1323from src .api .services import FileService
1424
logger = logging.getLogger(__name__)

# Audio file extensions accepted for voice prompt files.
# Entries are lower-case and include the leading dot.
ALLOWED_VOICE_PROMPT_EXTENSIONS = {
    '.wav',
    '.mp3',
    '.flac',
    '.ogg',
    '.m4a',
}
29+
1730
1831def create_tts_blueprint (output_dir , socketio ):
1932 """
@@ -133,10 +146,15 @@ def generate_tts():
133146 {
134147 "filename": "translated_book.epub",
135148 "target_language": "Chinese",
149+ "tts_provider": "edge-tts", // or "chatterbox"
136150 "tts_voice": "", // Optional, auto-select if empty
137151 "tts_rate": "+0%",
138152 "tts_format": "opus",
139- "tts_bitrate": "64k"
153+ "tts_bitrate": "64k",
154+ // Chatterbox-specific options:
155+ "tts_voice_prompt_path": "", // Path to audio file for voice cloning
156+ "tts_exaggeration": 0.5, // Emotion level (0.0-1.0)
157+ "tts_cfg_weight": 0.5 // Classifier-free guidance weight
140158 }
141159 """
142160 try :
@@ -157,17 +175,30 @@ def generate_tts():
157175
158176 # Get TTS configuration from request
159177 target_language = data .get ('target_language' , 'English' )
178+ provider = data .get ('tts_provider' , 'edge-tts' )
179+
180+ # Validate provider choice
181+ if provider == 'chatterbox' and not is_chatterbox_available ():
182+ return jsonify ({
183+ "error" : "Chatterbox TTS is not available" ,
184+ "details" : "Missing dependencies: torch, chatterbox-tts, or torchaudio. "
185+ "Install with: pip install chatterbox-tts torch torchaudio"
186+ }), 400
160187
161188 tts_config = TTSConfig (
162189 enabled = True ,
163- provider = 'edge-tts' ,
190+ provider = provider ,
164191 voice = data .get ('tts_voice' , '' ),
165192 rate = data .get ('tts_rate' , '+0%' ),
166193 volume = data .get ('tts_volume' , '+0%' ),
167194 pitch = data .get ('tts_pitch' , '+0Hz' ),
168195 output_format = data .get ('tts_format' , 'opus' ),
169196 bitrate = data .get ('tts_bitrate' , '64k' ),
170- target_language = target_language
197+ target_language = target_language ,
198+ # Chatterbox-specific settings
199+ voice_prompt_path = data .get ('tts_voice_prompt_path' , '' ),
200+ exaggeration = float (data .get ('tts_exaggeration' , 0.5 )),
201+ cfg_weight = float (data .get ('tts_cfg_weight' , 0.5 )),
171202 )
172203
173204 # Generate job ID
@@ -208,9 +239,7 @@ def get_tts_status(job_id):
208239
209240 @bp .route ('/api/tts/voices' , methods = ['GET' ])
210241 def list_voices ():
211- """List available TTS voices by language"""
212- from src .tts .tts_config import DEFAULT_VOICES
213-
242+ """List available TTS voices by language for Edge-TTS (default)"""
214243 # Group voices by language
215244 voices_by_language = {}
216245 for key , voice in DEFAULT_VOICES .items ():
@@ -223,4 +252,212 @@ def list_voices():
223252 "default_provider" : "edge-tts"
224253 })
225254
@bp.route('/api/tts/voices/chatterbox', methods=['GET'])
def list_chatterbox_voices():
    """
    Describe the Chatterbox TTS provider.

    Chatterbox has no named voices; the response therefore reports the
    supported languages (CHATTERBOX_LANGUAGES), whether the provider is
    currently usable, and a fixed set of feature flags.

    Returns:
        JSON with availability, languages, language count, features and a note
    """
    feature_flags = {
        "voice_cloning": True,
        "emotion_control": True,
        "gpu_acceleration": True,
    }

    payload = {
        "available": is_chatterbox_available(),
        "provider": "chatterbox",
        "languages": CHATTERBOX_LANGUAGES,
        "language_count": len(CHATTERBOX_LANGUAGES),
        "features": feature_flags,
        "note": "Voice is determined by uploaded voice prompt or uses default model voice",
    }
    return jsonify(payload)
280+
@bp.route('/api/tts/providers', methods=['GET'])
def list_providers():
    """
    Report every TTS provider together with its feature flags.

    Returns:
        JSON with a "providers" mapping (keyed by provider id) and the
        id of the default provider
    """
    # Count only DEFAULT_VOICES keys that are longer than two characters
    # and contain no hyphen.
    # NOTE(review): presumably this skips short codes and locale-style
    # keys so each language is counted once - confirm against DEFAULT_VOICES.
    edge_language_total = sum(
        1
        for voice_key in DEFAULT_VOICES
        if len(voice_key) > 2 and '-' not in voice_key
    )

    edge_entry = {
        "name": "Edge TTS",
        "description": "Microsoft Edge neural voices (cloud-based)",
        "available": True,  # reported unconditionally
        "features": {
            "voice_selection": True,
            "rate_control": True,
            "volume_control": True,
            "pitch_control": True,
            "voice_cloning": False,
            "gpu_required": False,
        },
        "language_count": edge_language_total,
    }

    chatterbox_entry = {
        "name": "Chatterbox TTS",
        "description": "Local GPU-accelerated TTS with voice cloning",
        "available": is_chatterbox_available(),
        "features": {
            "voice_selection": False,  # voice comes from the audio prompt
            "rate_control": False,
            "volume_control": False,
            "pitch_control": False,
            "voice_cloning": True,
            "emotion_control": True,
            "gpu_required": True,
        },
        "language_count": len(CHATTERBOX_LANGUAGES),
    }

    response_body = {
        "providers": {
            "edge-tts": edge_entry,
            "chatterbox": chatterbox_entry,
        },
        "default": "edge-tts",
    }
    return jsonify(response_body)
325+
@bp.route('/api/tts/gpu-status', methods=['GET'])
def gpu_status():
    """
    Report GPU status for Chatterbox TTS.

    The mapping returned by get_gpu_status() is extended in place with a
    "chatterbox_ready" entry and then serialised.

    Returns:
        JSON with the GPU status fields plus "chatterbox_ready"
    """
    gpu_info = get_gpu_status()

    # Ready only when the provider is usable AND CUDA is reported;
    # the CUDA flag is consulted only if the provider check passes.
    ready = is_chatterbox_available()
    if ready:
        ready = gpu_info.get("cuda_available", False)

    gpu_info["chatterbox_ready"] = ready
    return jsonify(gpu_info)
338+
@bp.route('/api/tts/voice-prompt/upload', methods=['POST'])
def upload_voice_prompt():
    """
    Upload an audio file for voice cloning with Chatterbox TTS.

    The file is stored in ``<output_dir>/voice_prompts`` under a name made
    of a random 8-character hex prefix, the sanitised original stem and the
    lower-cased extension, so repeated uploads do not overwrite each other.

    Form data:
        file: Audio file whose extension is listed in
              ALLOWED_VOICE_PROMPT_EXTENSIONS

    Returns:
        200 with JSON {"success", "filename", "path", "message"} on success;
        400 when no file or no filename was sent, or the extension is not
        allowed; 500 with the error text when the file cannot be saved.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400

    upload = request.files['file']
    # The client-supplied name may be None as well as empty.
    raw_name = (upload.filename or '').strip()
    if not raw_name:
        return jsonify({"error": "No file selected"}), 400

    # Read the extension from the raw name: secure_filename() drops
    # non-ASCII characters, so e.g. "录音.wav" would collapse to "wav" and
    # lose its extension if it were sanitised first.
    raw_stem, raw_ext = os.path.splitext(raw_name)
    ext = raw_ext.lower()

    if ext not in ALLOWED_VOICE_PROMPT_EXTENSIONS:
        return jsonify({
            "error": f"Invalid file type: {ext}",
            "allowed": list(ALLOWED_VOICE_PROMPT_EXTENSIONS)
        }), 400

    # Only the stem needs sanitising; ext is now a known allowlist entry.
    # secure_filename() may return an empty string, hence the fallback.
    safe_stem = secure_filename(raw_stem) or 'voice_prompt'

    # Create voice_prompts directory if it doesn't exist
    voice_prompts_dir = os.path.join(output_dir, 'voice_prompts')

    # Generate unique filename to avoid conflicts
    unique_filename = f"{uuid.uuid4().hex[:8]}_{safe_stem}{ext}"
    save_path = os.path.join(voice_prompts_dir, unique_filename)

    try:
        os.makedirs(voice_prompts_dir, exist_ok=True)
        upload.save(save_path)
    except Exception as e:
        logger.error(f"Failed to save voice prompt: {e}")
        return jsonify({
            "error": "Failed to save voice prompt",
            "details": str(e)
        }), 500

    logger.info(f"Voice prompt saved: {save_path}")
    return jsonify({
        "success": True,
        "filename": unique_filename,
        "path": save_path,
        "message": "Voice prompt uploaded successfully"
    })
395+
@bp.route('/api/tts/voice-prompts', methods=['GET'])
def list_voice_prompts():
    """
    List available voice prompt files for voice cloning.

    Only regular files in ``<output_dir>/voice_prompts`` whose extension is
    in ALLOWED_VOICE_PROMPT_EXTENSIONS are reported, sorted by filename.

    Returns:
        JSON with "voice_prompts" (list of {"filename", "path",
        "size_bytes", "extension"}), "directory" and "count". The same
        keys are returned when the directory does not exist yet.
    """
    voice_prompts_dir = os.path.join(output_dir, 'voice_prompts')

    try:
        # Sorted so the listing order is stable between requests.
        names = sorted(os.listdir(voice_prompts_dir))
    except (FileNotFoundError, NotADirectoryError):
        # Nothing has been uploaded yet.
        names = []

    prompts = []
    for name in names:
        ext = os.path.splitext(name)[1].lower()
        if ext not in ALLOWED_VOICE_PROMPT_EXTENSIONS:
            continue

        filepath = os.path.join(voice_prompts_dir, name)
        if not os.path.isfile(filepath):
            continue

        try:
            size_bytes = os.path.getsize(filepath)
        except OSError:
            # The file vanished between listing and stat; skip it
            # instead of failing the whole request.
            continue

        prompts.append({
            "filename": name,
            "path": filepath,
            "size_bytes": size_bytes,
            "extension": ext
        })

    return jsonify({
        "voice_prompts": prompts,
        "directory": voice_prompts_dir,
        "count": len(prompts)
    })
429+
@bp.route('/api/tts/voice-prompt/<filename>', methods=['DELETE'])
def delete_voice_prompt(filename):
    """
    Delete a voice prompt file.

    Args:
        filename: Name of the voice prompt file to delete. It is passed
            through secure_filename() before being joined to the
            voice_prompts directory.

    Returns:
        200 with JSON {"success", "message"} when the file was removed;
        400 when the name is empty after sanitising; 404 when no such
        file exists; 500 with the error text when removal fails.
    """
    # secure_filename() returns "" for names such as ".." or names made
    # only of non-ASCII characters. Joining "" would yield the directory
    # itself, so reject it before touching the filesystem.
    safe_name = secure_filename(filename)
    if not safe_name:
        return jsonify({"error": "Invalid filename"}), 400

    voice_prompts_dir = os.path.join(output_dir, 'voice_prompts')
    filepath = os.path.join(voice_prompts_dir, safe_name)

    # Only regular files may be deleted through this route.
    if not os.path.isfile(filepath):
        return jsonify({"error": "Voice prompt not found"}), 404

    try:
        os.remove(filepath)
    except FileNotFoundError:
        # Removed by someone else between the check and the delete.
        return jsonify({"error": "Voice prompt not found"}), 404
    except OSError as e:
        logger.error(f"Failed to delete voice prompt: {e}")
        return jsonify({
            "error": "Failed to delete voice prompt",
            "details": str(e)
        }), 500

    logger.info(f"Voice prompt deleted: {filepath}")
    return jsonify({
        "success": True,
        "message": f"Voice prompt '{filename}' deleted"
    })
462+
226463 return bp
0 commit comments