diff --git a/.env.example b/.env.example index 49245c9..97b7bf9 100755 --- a/.env.example +++ b/.env.example @@ -93,6 +93,27 @@ LONG_TEXT_JOB_RETENTION_DAYS=7 # Maximum number of concurrent long text jobs (default: 3) LONG_TEXT_MAX_CONCURRENT_JOBS=3 +# Minimum characters for Long Text async API +LONG_TEXT_MIN_LENGTH=100 + +# ============================================================================= +# Pause Handling Configuration +# ============================================================================= + +# Enable automatic pauses at punctuation patterns (true/false) +ENABLE_PUNCTUATION_PAUSES=true + +# Pause durations in milliseconds for supported punctuation +ELLIPSIS_PAUSE_MS=600 +EM_DASH_PAUSE_MS=400 +EN_DASH_PAUSE_MS=350 +PARAGRAPH_PAUSE_MS=500 +LINE_BREAK_PAUSE_MS=250 + +# Clamp pause durations to avoid extreme values +MIN_PAUSE_MS=100 +MAX_PAUSE_MS=2000 + # ============================================================================= # Docker-specific Configuration # ============================================================================= @@ -146,4 +167,4 @@ ENABLE_MEMORY_MONITORING=true # For slower, more careful speech: # CFG_WEIGHT=0.8 -# TEMPERATURE=0.4 \ No newline at end of file +# TEMPERATURE=0.4 diff --git a/.env.example.docker b/.env.example.docker index 126cfad..f38bee4 100755 --- a/.env.example.docker +++ b/.env.example.docker @@ -79,6 +79,9 @@ LONG_TEXT_MAX_LENGTH=100000 # Chunk size for splitting long text (default: 2500 chars, must be < MAX_TOTAL_LENGTH) LONG_TEXT_CHUNK_SIZE=2500 +# Batch processing configuration for GPU utilization (tune based on GPU memory) +LONG_TEXT_BATCH_SIZE=6 # RTX 3090/4090: 4-6, A100: 8-12, H100: 12-16 + # Silence padding between chunks in milliseconds (default: 200ms) LONG_TEXT_SILENCE_PADDING_MS=200 @@ -88,6 +91,37 @@ LONG_TEXT_JOB_RETENTION_DAYS=7 # Maximum number of concurrent long text jobs (default: 3) LONG_TEXT_MAX_CONCURRENT_JOBS=3 +# Minimum characters for Long Text async API +LONG_TEXT_MIN_LENGTH=100 + +# Default chunking strategy and quality preset for long text processing +LONG_TEXT_CHUNKING_STRATEGY=sentence +LONG_TEXT_QUALITY_PRESET=balanced + +# Quality preset tuning parameters +QUALITY_FAST_CHUNK_SIZE=1500 +QUALITY_FAST_CFG_WEIGHT=0.3 +QUALITY_FAST_TEMPERATURE=0.6 + +QUALITY_BALANCED_CHUNK_SIZE=2500 +QUALITY_BALANCED_CFG_WEIGHT=0.5 +QUALITY_BALANCED_TEMPERATURE=0.8 + +QUALITY_HIGH_CHUNK_SIZE=2800 +QUALITY_HIGH_CFG_WEIGHT=0.7 +QUALITY_HIGH_TEMPERATURE=1.0 + +# Pause handling configuration +ENABLE_PUNCTUATION_PAUSES=true +ELLIPSIS_PAUSE_MS=800 +EM_DASH_PAUSE_MS=550 +EN_DASH_PAUSE_MS=375 +PERIOD_PAUSE_MS=500 +PARAGRAPH_PAUSE_MS=800 +LINE_BREAK_PAUSE_MS=350 +MIN_PAUSE_MS=200 +MAX_PAUSE_MS=2000 + # ============================================================================= # Docker Volume Configuration # ============================================================================= @@ -141,4 +175,4 @@ ENABLE_MEMORY_MONITORING=true # For slower, more careful speech: # CFG_WEIGHT=0.8 -# TEMPERATURE=0.4 \ No newline at end of file +# TEMPERATURE=0.4 diff --git a/.gitignore b/.gitignore index a7f4fa8..8ef98e1 100755 --- a/.gitignore +++ b/.gitignore @@ -99,6 +99,7 @@ coverage.xml test_* !tests/test_* +!unit_tests/test_* reference/ CLAUDE.md diff --git a/README.md b/README.md index ff65d82..4a250ea 100755 --- a/README.md +++ b/README.md @@ -123,18 +123,18 @@ cp .env.example.docker .env # Docker-specific paths, ready to use # Choose your deployment method: # API Only (default) -docker compose -f docker/docker-compose.yml 
up -d # Standard (pip-based) -docker compose -f docker/docker-compose.uv.yml up -d # uv-optimized (faster builds) -docker compose -f docker/docker-compose.gpu.yml up -d # Standard + GPU -docker compose -f docker/docker-compose.uv.gpu.yml up -d # uv + GPU (recommended for GPU users) -docker compose -f docker/docker-compose.cpu.yml up -d # CPU-only -docker compose -f docker/docker-compose.blackwell.yml up -d # Blackwell (50XX) NVIDIA GPUs +docker compose -p tts-api -f docker/docker-compose.yml up -d # Standard (pip-based) +docker compose -p tts-api -f docker/docker-compose.uv.yml up -d # uv-optimized (faster builds) +docker compose -p tts-api -f docker/docker-compose.gpu.yml up -d # Standard + GPU +docker compose -p tts-api -f docker/docker-compose.uv.gpu.yml up -d # uv + GPU (recommended for GPU users) +docker compose -p tts-api -f docker/docker-compose.cpu.yml up -d # CPU-only +docker compose -p tts-api -f docker/docker-compose.blackwell.yml up -d # Blackwell (50XX) NVIDIA GPUs # API + Frontend (add --profile frontend to any of the above) -docker compose -f docker/docker-compose.yml --profile frontend up -d # Standard + Frontend -docker compose -f docker/docker-compose.gpu.yml --profile frontend up -d # GPU + Frontend -docker compose -f docker/docker-compose.uv.gpu.yml --profile frontend up -d # uv + GPU + Frontend -docker compose -f docker/docker-compose.blackwell.yml --profile frontend up -d # (Blackwell) uv + GPU + Frontend +docker compose -p tts-api -f docker/docker-compose.yml --profile frontend up -d # Standard + Frontend +docker compose -p tts-api -f docker/docker-compose.gpu.yml --profile frontend up -d # GPU + Frontend +docker compose -p tts-api -f docker/docker-compose.uv.gpu.yml --profile frontend up -d # uv + GPU + Frontend +docker compose -p tts-api -f docker/docker-compose.blackwell.yml --profile frontend up -d # (Blackwell) uv + GPU + Frontend # Watch the logs as it initializes (the first use of TTS takes the longest) docker logs chatterbox-tts-api -f diff --git a/app/api/endpoints/config.py b/app/api/endpoints/config.py index ecf58d2..4e90140 100644 --- a/app/api/endpoints/config.py +++ b/app/api/endpoints/config.py @@ -50,7 +50,8 @@ async def get_config(): "cfg_weight": Config.CFG_WEIGHT, "temperature": Config.TEMPERATURE, "max_chunk_length": Config.MAX_CHUNK_LENGTH, - "max_total_length": Config.MAX_TOTAL_LENGTH + "max_total_length": Config.MAX_TOTAL_LENGTH, + "long_text_min_length": Config.get_long_text_min_length(), }, memory_management={ "memory_cleanup_interval": Config.MEMORY_CLEANUP_INTERVAL, diff --git a/app/api/endpoints/long_text.py b/app/api/endpoints/long_text.py index fa6ed90..aa5e05a 100644 --- a/app/api/endpoints/long_text.py +++ b/app/api/endpoints/long_text.py @@ -1,5 +1,5 @@ """ -Long text TTS endpoints for processing texts > 3000 characters +Long text TTS endpoints for processing texts that exceed the configured minimum length """ import asyncio @@ -30,6 +30,7 @@ from app.config import Config from app.core.long_text_jobs import get_job_manager from app.core.background_tasks import get_processor +from app.core.quality_presets import get_quality_preset from app.core.text_processing import validate_long_text_input, estimate_processing_time from app.core import add_route_aliases @@ -43,7 +44,7 @@ async def create_long_text_job(request: LongTextRequest): """ Submit a long text TTS job for background processing. - Text must be > 3000 characters to use this endpoint. + Text must exceed the configured minimum length to use this endpoint. 
For shorter texts, use /audio/speech instead. """ try: @@ -60,6 +61,17 @@ async def create_long_text_job(request: LongTextRequest): } ) + # Resolve quality and chunking configuration + preset_name = request.get_quality_preset() + preset_config = get_quality_preset(preset_name) + + cfg_weight = request.cfg_weight if request.cfg_weight is not None else preset_config["cfg_weight"] + temperature = request.temperature if request.temperature is not None else preset_config["temperature"] + chunk_size = request.get_chunk_size(preset_config) + silence_padding = request.get_silence_padding() + chunking_strategy = request.get_chunking_strategy() + pause_settings = request.resolve_pause_settings() + # Get job manager and processor job_manager = get_job_manager() processor = get_processor() @@ -70,16 +82,22 @@ async def create_long_text_job(request: LongTextRequest): voice=request.voice, output_format=request.response_format or "mp3", exaggeration=request.exaggeration, - cfg_weight=request.cfg_weight, - temperature=request.temperature, - session_id=request.session_id + cfg_weight=cfg_weight, + temperature=temperature, + session_id=request.session_id, + chunking_strategy=chunking_strategy, + chunk_size=chunk_size, + silence_padding=silence_padding, + quality_preset=preset_name, + enable_pauses=pause_settings["enable"], + custom_pauses=pause_settings["custom"], ) # Submit for background processing await processor.submit_job(job_id) # Estimate processing time - estimated_time = estimate_processing_time(len(request.input)) + estimated_time = estimate_processing_time(len(request.input), chunk_size=chunk_size) return LongTextJobCreateResponse( job_id=job_id, diff --git a/app/api/endpoints/speech.py b/app/api/endpoints/speech.py index aabb687..18a88c8 100644 --- a/app/api/endpoints/speech.py +++ b/app/api/endpoints/speech.py @@ -1,11 +1,10 @@ -""" -Text-to-speech endpoint -""" +"""Text-to-speech endpoint.""" import io import os import asyncio import tempfile +import logging import torch import torchaudio as ta import base64 @@ -22,6 +21,7 @@ split_text_into_chunks, concatenate_audio_chunks, add_route_aliases, TTSStatus, start_tts_request, update_tts_status, get_voice_library ) +from app.core.pause_handler import PauseHandler from app.core.tts_model import get_model, is_multilingual from app.core.text_processing import split_text_for_streaming, get_streaming_settings @@ -29,6 +29,8 @@ base_router = APIRouter() router = add_route_aliases(base_router) +logger = logging.getLogger(__name__) + # Request counter for memory management REQUEST_COUNTER = 0 @@ -147,14 +149,27 @@ async def generate_speech_internal( language_id: str = "en", exaggeration: Optional[float] = None, cfg_weight: Optional[float] = None, - temperature: Optional[float] = None + temperature: Optional[float] = None, + enable_pauses: Optional[bool] = None, + custom_pauses: Optional[Dict[str, int]] = None, ) -> io.BytesIO: - """Internal function to generate speech with given parameters""" + """Internal function to generate speech with given parameters.""" global REQUEST_COUNTER REQUEST_COUNTER += 1 # Start TTS request tracking voice_source = "uploaded file" if voice_sample_path != Config.VOICE_SAMPLE_PATH else "default" + resolved_enable_pauses = ( + Config.ENABLE_PUNCTUATION_PAUSES if enable_pauses is None else bool(enable_pauses) + ) + pause_overrides = {} + if custom_pauses: + for key, value in custom_pauses.items(): + try: + pause_overrides[str(key)] = int(value) + except (TypeError, ValueError): + logger.debug("Ignoring invalid custom pause 
override %r=%r", key, value) + request_id = start_tts_request( text=text, voice_source=voice_source, @@ -162,7 +177,9 @@ async def generate_speech_internal( "exaggeration": exaggeration, "cfg_weight": cfg_weight, "temperature": temperature, - "voice_sample_path": voice_sample_path + "voice_sample_path": voice_sample_path, + "enable_pauses": resolved_enable_pauses, + "custom_pauses": pause_overrides, } ) @@ -203,41 +220,106 @@ async def generate_speech_internal( } ) - audio_chunks = [] + audio_chunks: List[Any] = [] final_audio = None buffer = None - + assembled_segments: List[Any] = [] + silence_segments: List[Any] = [] + try: # Get parameters with defaults exaggeration = exaggeration if exaggeration is not None else Config.EXAGGERATION cfg_weight = cfg_weight if cfg_weight is not None else Config.CFG_WEIGHT temperature = temperature if temperature is not None else Config.TEMPERATURE - - # Split text into chunks - update_tts_status(request_id, TTSStatus.CHUNKING, "Splitting text into chunks") - chunks = split_text_into_chunks(text, Config.MAX_CHUNK_LENGTH) - + + # Prepare text segments (respect pause settings) + update_tts_status(request_id, TTSStatus.CHUNKING, "Preparing text segments") + + if resolved_enable_pauses: + pause_defaults = { + "...": Config.ELLIPSIS_PAUSE_MS, + "—": Config.EM_DASH_PAUSE_MS, + "–": Config.EN_DASH_PAUSE_MS, + r"\.": Config.PERIOD_PAUSE_MS, + "\n\n": Config.PARAGRAPH_PAUSE_MS, + "\n": Config.LINE_BREAK_PAUSE_MS, + } + pause_defaults.update(pause_overrides) + + pause_handler = PauseHandler( + enable_pauses=True, + custom_pauses=pause_defaults, + min_pause_ms=Config.MIN_PAUSE_MS, + max_pause_ms=Config.MAX_PAUSE_MS, + ) + + pause_chunks = pause_handler.process(text) + tts_segments: List[Dict[str, Any]] = [] + for pause_chunk in pause_chunks: + sub_chunks = split_text_into_chunks(pause_chunk.text, Config.MAX_CHUNK_LENGTH) + for idx, sub_chunk in enumerate(sub_chunks): + pause_after = pause_chunk.pause_after_ms if idx == len(sub_chunks) - 1 else 0 + if sub_chunk.strip(): + tts_segments.append({ + "text": sub_chunk, + "pause_after_ms": pause_after, + }) + else: + raw_chunks = split_text_into_chunks(text, Config.MAX_CHUNK_LENGTH) + tts_segments = [ + {"text": chunk, "pause_after_ms": 0} + for chunk in raw_chunks + if chunk.strip() + ] + + if not tts_segments: + update_tts_status(request_id, TTSStatus.ERROR, "No text segments available for generation") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={ + "error": { + "message": "No valid text segments found after processing pauses.", + "type": "invalid_request_error", + } + }, + ) + voice_source = "uploaded file" if voice_sample_path != Config.VOICE_SAMPLE_PATH else "configured sample" - print(f"Processing {len(chunks)} text chunks with {voice_source} and parameters:") + print(f"Processing {len(tts_segments)} text segments with {voice_source} and parameters:") print(f" - Exaggeration: {exaggeration}") print(f" - CFG Weight: {cfg_weight}") print(f" - Temperature: {temperature}") - + # Update status with chunk information - update_tts_status(request_id, TTSStatus.GENERATING_AUDIO, "Starting audio generation", - current_chunk=0, total_chunks=len(chunks)) - + update_tts_status( + request_id, + TTSStatus.GENERATING_AUDIO, + "Starting audio generation", + current_chunk=0, + total_chunks=len(tts_segments), + ) + # Generate audio for each chunk with memory management loop = asyncio.get_event_loop() - - for i, chunk in enumerate(chunks): + + channels = None + dtype = None + + for i, segment in 
enumerate(tts_segments): + chunk = segment["text"] + pause_after_ms = int(segment["pause_after_ms"]) # Update progress - current_step = f"Generating audio for chunk {i+1}/{len(chunks)}" - update_tts_status(request_id, TTSStatus.GENERATING_AUDIO, current_step, - current_chunk=i+1, total_chunks=len(chunks)) - - print(f"Generating audio for chunk {i+1}/{len(chunks)}: '{chunk[:50]}{'...' if len(chunk) > 50 else ''}'") - + current_step = f"Generating audio for chunk {i+1}/{len(tts_segments)}" + update_tts_status( + request_id, + TTSStatus.GENERATING_AUDIO, + current_step, + current_chunk=i + 1, + total_chunks=len(tts_segments), + ) + + print(f"Generating audio for chunk {i+1}/{len(tts_segments)}: '{chunk[:50]}{'...' if len(chunk) > 50 else ''}'") + # Use torch.no_grad() to prevent gradient accumulation with torch.no_grad(): # Run TTS generation in executor to avoid blocking @@ -263,8 +345,24 @@ async def generate_speech_internal( if hasattr(audio_tensor, 'detach'): audio_tensor = audio_tensor.detach() + if audio_tensor.dim() == 1: + audio_tensor = audio_tensor.unsqueeze(0) + audio_chunks.append(audio_tensor) - + assembled_segments.append(audio_tensor) + + if channels is None: + channels = audio_tensor.shape[0] + if dtype is None: + dtype = audio_tensor.dtype + + if pause_after_ms > 0 and channels is not None and dtype is not None: + silence_samples = max(0, int(round((pause_after_ms / 1000.0) * model.sr))) + if silence_samples > 0: + silence_tensor = torch.zeros((channels, silence_samples), dtype=dtype, device=audio_tensor.device) + assembled_segments.append(silence_tensor) + silence_segments.append(silence_tensor) + # Periodic memory cleanup during generation if i > 0 and i % 3 == 0: # Every 3 chunks import gc @@ -273,13 +371,18 @@ async def generate_speech_internal( torch.cuda.empty_cache() # Concatenate all chunks with memory management - if len(audio_chunks) > 1: + if len(assembled_segments) == 1: + final_audio = assembled_segments[0] + else: update_tts_status(request_id, TTSStatus.CONCATENATING, "Concatenating audio chunks") print("Concatenating audio chunks...") with torch.no_grad(): - final_audio = concatenate_audio_chunks(audio_chunks, model.sr) - else: - final_audio = audio_chunks[0] + if resolved_enable_pauses: + final_audio = assembled_segments[0] + for segment in assembled_segments[1:]: + final_audio = torch.cat([final_audio, segment.to(final_audio.device)], dim=1) + else: + final_audio = concatenate_audio_chunks(audio_chunks, model.sr) # Convert to WAV format update_tts_status(request_id, TTSStatus.FINALIZING, "Converting to WAV format") @@ -320,16 +423,21 @@ async def generate_speech_internal( # Clean up all audio chunks for chunk in audio_chunks: safe_delete_tensors(chunk) - + + for silence in silence_segments: + safe_delete_tensors(silence) + # Clean up final audio tensor if final_audio is not None: safe_delete_tensors(final_audio) if 'final_audio_cpu' in locals(): safe_delete_tensors(final_audio_cpu) - + # Clear the list audio_chunks.clear() - + assembled_segments.clear() + silence_segments.clear() + # Periodic memory cleanup if REQUEST_COUNTER % Config.MEMORY_CLEANUP_INTERVAL == 0: cleanup_memory() @@ -796,7 +904,11 @@ async def text_to_speech(request: TTSRequest): # Resolve voice name to file path and language voice_sample_path, language_id = resolve_voice_path_and_language(request.voice) - + + enable_pauses = request.enable_pauses + if enable_pauses is None: + enable_pauses = Config.ENABLE_PUNCTUATION_PAUSES + # Check if SSE streaming is requested if request.stream_format == 
"sse": # Return SSE streaming response @@ -827,7 +939,9 @@ async def text_to_speech(request: TTSRequest): language_id=language_id, exaggeration=request.exaggeration, cfg_weight=request.cfg_weight, - temperature=request.temperature + temperature=request.temperature, + enable_pauses=enable_pauses, + custom_pauses=request.custom_pauses, ) # Create response diff --git a/app/config.py b/app/config.py index 5b9fc87..c9a9c51 100644 --- a/app/config.py +++ b/app/config.py @@ -12,7 +12,10 @@ class Config: """Application configuration class""" - + + _DEFAULT_LONG_TEXT_MIN_LENGTH = 3000 + _DEFAULT_LONG_TEXT_MAX_LENGTH = 100000 + # Server settings HOST = os.getenv('HOST', '0.0.0.0') PORT = int(os.getenv('PORT', 4123)) @@ -36,11 +39,44 @@ class Config: # Long text processing settings LONG_TEXT_DATA_DIR = os.getenv('LONG_TEXT_DATA_DIR', './data/long_text_jobs') - LONG_TEXT_MAX_LENGTH = int(os.getenv('LONG_TEXT_MAX_LENGTH', 100000)) + LONG_TEXT_MIN_LENGTH = int(os.getenv('LONG_TEXT_MIN_LENGTH', _DEFAULT_LONG_TEXT_MIN_LENGTH)) + LONG_TEXT_MAX_LENGTH = int(os.getenv('LONG_TEXT_MAX_LENGTH', _DEFAULT_LONG_TEXT_MAX_LENGTH)) LONG_TEXT_CHUNK_SIZE = int(os.getenv('LONG_TEXT_CHUNK_SIZE', 2500)) + LONG_TEXT_BATCH_SIZE = int(os.getenv('LONG_TEXT_BATCH_SIZE', 4)) LONG_TEXT_SILENCE_PADDING_MS = int(os.getenv('LONG_TEXT_SILENCE_PADDING_MS', 200)) LONG_TEXT_JOB_RETENTION_DAYS = int(os.getenv('LONG_TEXT_JOB_RETENTION_DAYS', 7)) LONG_TEXT_MAX_CONCURRENT_JOBS = int(os.getenv('LONG_TEXT_MAX_CONCURRENT_JOBS', 3)) + LONG_TEXT_CHUNKING_STRATEGY = os.getenv('LONG_TEXT_CHUNKING_STRATEGY', 'sentence') + LONG_TEXT_QUALITY_PRESET = os.getenv('LONG_TEXT_QUALITY_PRESET', 'balanced') + + QUALITY_PRESETS = { + "fast": { + "chunk_size": int(os.getenv('QUALITY_FAST_CHUNK_SIZE', '1500')), + "cfg_weight": float(os.getenv('QUALITY_FAST_CFG_WEIGHT', '0.3')), + "temperature": float(os.getenv('QUALITY_FAST_TEMPERATURE', '0.6')), + }, + "balanced": { + "chunk_size": int(os.getenv('QUALITY_BALANCED_CHUNK_SIZE', '2500')), + "cfg_weight": float(os.getenv('QUALITY_BALANCED_CFG_WEIGHT', '0.5')), + "temperature": float(os.getenv('QUALITY_BALANCED_TEMPERATURE', '0.8')), + }, + "high": { + "chunk_size": int(os.getenv('QUALITY_HIGH_CHUNK_SIZE', '2800')), + "cfg_weight": float(os.getenv('QUALITY_HIGH_CFG_WEIGHT', '0.7')), + "temperature": float(os.getenv('QUALITY_HIGH_TEMPERATURE', '1.0')), + }, + } + + # Pause handling configuration + ENABLE_PUNCTUATION_PAUSES = os.getenv('ENABLE_PUNCTUATION_PAUSES', 'true').lower() == 'true' + ELLIPSIS_PAUSE_MS = int(os.getenv('ELLIPSIS_PAUSE_MS', 800)) + EM_DASH_PAUSE_MS = int(os.getenv('EM_DASH_PAUSE_MS', 550)) + EN_DASH_PAUSE_MS = int(os.getenv('EN_DASH_PAUSE_MS', 375)) + PERIOD_PAUSE_MS = int(os.getenv('PERIOD_PAUSE_MS', 500)) + PARAGRAPH_PAUSE_MS = int(os.getenv('PARAGRAPH_PAUSE_MS', 800)) + LINE_BREAK_PAUSE_MS = int(os.getenv('LINE_BREAK_PAUSE_MS', 350)) + MIN_PAUSE_MS = int(os.getenv('MIN_PAUSE_MS', 200)) + MAX_PAUSE_MS = int(os.getenv('MAX_PAUSE_MS', 2000)) # Multilingual model settings USE_MULTILINGUAL_MODEL = os.getenv('USE_MULTILINGUAL_MODEL', 'true').lower() == 'true' @@ -56,6 +92,9 @@ class Config: @classmethod def validate(cls): """Validate configuration values""" + min_length = cls.get_long_text_min_length() + max_length = cls.get_long_text_max_length() + if not (0.25 <= cls.EXAGGERATION <= 2.0): raise ValueError(f"EXAGGERATION must be between 0.25 and 2.0, got {cls.EXAGGERATION}") if not (0.0 <= cls.CFG_WEIGHT <= 1.0): @@ -70,8 +109,14 @@ def validate(cls): raise ValueError(f"MEMORY_CLEANUP_INTERVAL 
must be positive, got {cls.MEMORY_CLEANUP_INTERVAL}") if cls.CUDA_CACHE_CLEAR_INTERVAL <= 0: raise ValueError(f"CUDA_CACHE_CLEAR_INTERVAL must be positive, got {cls.CUDA_CACHE_CLEAR_INTERVAL}") - if cls.LONG_TEXT_MAX_LENGTH <= cls.MAX_TOTAL_LENGTH: - raise ValueError(f"LONG_TEXT_MAX_LENGTH ({cls.LONG_TEXT_MAX_LENGTH}) must be greater than MAX_TOTAL_LENGTH ({cls.MAX_TOTAL_LENGTH})") + if min_length <= 0: + raise ValueError(f"LONG_TEXT_MIN_LENGTH must be positive, got {min_length}") + if max_length <= min_length: + raise ValueError( + "LONG_TEXT_MAX_LENGTH ({}) must be greater than LONG_TEXT_MIN_LENGTH ({})".format( + max_length, min_length + ) + ) if cls.LONG_TEXT_CHUNK_SIZE <= 0: raise ValueError(f"LONG_TEXT_CHUNK_SIZE must be positive, got {cls.LONG_TEXT_CHUNK_SIZE}") if cls.LONG_TEXT_CHUNK_SIZE >= cls.MAX_TOTAL_LENGTH: @@ -82,6 +127,44 @@ def validate(cls): raise ValueError(f"LONG_TEXT_JOB_RETENTION_DAYS must be positive, got {cls.LONG_TEXT_JOB_RETENTION_DAYS}") if cls.LONG_TEXT_MAX_CONCURRENT_JOBS <= 0: raise ValueError(f"LONG_TEXT_MAX_CONCURRENT_JOBS must be positive, got {cls.LONG_TEXT_MAX_CONCURRENT_JOBS}") + if cls.MIN_PAUSE_MS < 0: + raise ValueError(f"MIN_PAUSE_MS must be non-negative, got {cls.MIN_PAUSE_MS}") + if cls.MAX_PAUSE_MS < cls.MIN_PAUSE_MS: + raise ValueError( + f"MAX_PAUSE_MS ({cls.MAX_PAUSE_MS}) must be greater than or equal to MIN_PAUSE_MS ({cls.MIN_PAUSE_MS})" + ) + + @staticmethod + def _get_int_env(name: str, fallback: int) -> int: + value = os.getenv(name) + if value is None or value == "": + return fallback + + try: + return int(value) + except ValueError as exc: + raise ValueError(f"{name} must be an integer, got {value!r}") from exc + + @classmethod + def refresh_long_text_limits(cls) -> None: + cls.LONG_TEXT_MIN_LENGTH = cls._get_int_env( + "LONG_TEXT_MIN_LENGTH", + cls._DEFAULT_LONG_TEXT_MIN_LENGTH, + ) + cls.LONG_TEXT_MAX_LENGTH = cls._get_int_env( + "LONG_TEXT_MAX_LENGTH", + cls._DEFAULT_LONG_TEXT_MAX_LENGTH, + ) + + @classmethod + def get_long_text_min_length(cls) -> int: + cls.refresh_long_text_limits() + return cls.LONG_TEXT_MIN_LENGTH + + @classmethod + def get_long_text_max_length(cls) -> int: + cls.refresh_long_text_limits() + return cls.LONG_TEXT_MAX_LENGTH def detect_device(): diff --git a/app/core/background_tasks.py b/app/core/background_tasks.py index 1ceca8c..b22d729 100644 --- a/app/core/background_tasks.py +++ b/app/core/background_tasks.py @@ -8,11 +8,11 @@ import traceback from datetime import datetime from pathlib import Path -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List, Tuple from app.config import Config from app.core.long_text_jobs import get_job_manager -from app.core.text_processing import split_text_for_long_generation, estimate_processing_time +from app.core.text_processing import split_text_for_long_generation from app.core.audio_processing import concatenate_audio_files, AudioConcatenationError from app.api.endpoints.speech import generate_speech_internal, resolve_voice_path_and_language from app.models.long_text import ( @@ -120,7 +120,7 @@ def _cleanup_task(self, job_id: str): del self.active_tasks[job_id] async def _process_job(self, job_id: str): - """Process a single long text job""" + """Process a single long text job with batched GPU inference""" logger.info(f"Starting processing for job {job_id}") try: @@ -141,12 +141,22 @@ async def _process_job(self, job_id: str): await self._fail_job(job_id, "Input text not found") return + parameters = metadata.parameters or {} + chunk_size = 
int(parameters.get('chunk_size', Config.LONG_TEXT_CHUNK_SIZE)) + if chunk_size <= 0: + chunk_size = Config.LONG_TEXT_CHUNK_SIZE + + chunking_strategy = parameters.get( + 'chunking_strategy', Config.LONG_TEXT_CHUNKING_STRATEGY + ) + # Phase 1: Text chunking await self._update_job_status(job_id, LongTextJobStatus.CHUNKING, "Splitting text into chunks") chunks = split_text_for_long_generation( input_text, - max_chunk_size=Config.LONG_TEXT_CHUNK_SIZE + max_chunk_size=chunk_size, + strategy=chunking_strategy ) if not chunks: @@ -160,82 +170,89 @@ async def _process_job(self, job_id: str): logger.info(f"Job {job_id}: Split into {len(chunks)} chunks") - # Phase 2: Generate audio for each chunk - await self._update_job_status(job_id, LongTextJobStatus.PROCESSING, f"Generating audio for {len(chunks)} chunks") + # Phase 2: Generate audio for all chunks with batching + await self._update_job_status( + job_id, + LongTextJobStatus.PROCESSING, + f"Generating audio for {len(chunks)} chunks", + ) voice_path, language_id = resolve_voice_path_and_language(metadata.voice) - chunk_audio_files = [] - for i, chunk in enumerate(chunks): - # Check if job was paused or cancelled + batch_size = int(parameters.get('batch_size', Config.LONG_TEXT_BATCH_SIZE)) + if batch_size <= 0: + batch_size = Config.LONG_TEXT_BATCH_SIZE + + chunk_audio_data: List[Tuple[int, Any, LongTextChunk]] = [] + + for batch_start in range(0, len(chunks), batch_size): current_metadata = self.job_manager._load_job_metadata(job_id) if current_metadata and current_metadata.status in [LongTextJobStatus.PAUSED, LongTextJobStatus.CANCELLED]: logger.info(f"Job {job_id} was paused/cancelled, stopping processing") return - # Update current chunk - current_metadata.current_chunk = i - self.job_manager._save_job_metadata(current_metadata) - - # Update chunk status - chunk.processing_started_at = datetime.utcnow() - chunks[i] = chunk # Update in list + batch_end = min(batch_start + batch_size, len(chunks)) + batch_chunks = chunks[batch_start:batch_end] - logger.info(f"Job {job_id}: Processing chunk {i+1}/{len(chunks)} ({len(chunk.text)} chars)") + logger.info( + f"Job {job_id}: Processing batch {batch_start // batch_size + 1} " + f"(chunks {batch_start + 1}-{batch_end}/{len(chunks)})" + ) - try: - # Generate audio for this chunk - audio_buffer = await generate_speech_internal( - text=chunk.text, - voice_sample_path=voice_path, - language_id=language_id, - exaggeration=metadata.parameters.get('exaggeration'), - cfg_weight=metadata.parameters.get('cfg_weight'), - temperature=metadata.parameters.get('temperature') + batch_tasks = [] + for i, chunk in enumerate(batch_chunks, start=batch_start): + batch_tasks.append( + self._generate_chunk_audio( + job_id=job_id, + chunk=chunk, + chunk_index=i, + voice_path=voice_path, + language_id=language_id, + parameters=parameters, + ) ) - # Save chunk audio file - chunk_filename = f"chunk_{i+1:03d}.wav" - chunk_audio_path = self.job_manager._get_job_file_paths(job_id)['chunks_dir'] / chunk_filename - - with open(chunk_audio_path, 'wb') as f: - f.write(audio_buffer.getvalue()) + batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True) - # Update chunk metadata - chunk.audio_file = chunk_filename - chunk.processing_completed_at = datetime.utcnow() - chunk.duration_ms = int((chunk.processing_completed_at - chunk.processing_started_at).total_seconds() * 1000) - - chunk_audio_files.append(chunk_audio_path) - chunks[i] = chunk - - # Update job progress - current_metadata.completed_chunks = i + 1 - 
self.job_manager._save_job_metadata(current_metadata) - self.job_manager._save_chunks_data(job_id, chunks) - - logger.info(f"Job {job_id}: Completed chunk {i+1}/{len(chunks)}") + current_metadata = self.job_manager._load_job_metadata(job_id) + if not current_metadata: + current_metadata = metadata + + for i, result in enumerate(batch_results, start=batch_start): + chunk = chunks[i] + if isinstance(result, Exception): + logger.error(f"Job {job_id}: Failed to process chunk {i + 1}: {result}") + chunk.error = str(result) + chunks[i] = chunk + if i not in current_metadata.failed_chunks: + current_metadata.failed_chunks.append(i) + else: + audio_buffer, updated_chunk = result + chunk_audio_data.append((i, audio_buffer, updated_chunk)) + chunks[i] = updated_chunk + + completed_chunks = len([c for c in chunks if c.audio_file]) + current_metadata.completed_chunks = completed_chunks + current_metadata.current_chunk = min(batch_end, len(chunks)) - 1 + self.job_manager._save_job_metadata(current_metadata) + self.job_manager._save_chunks_data(job_id, chunks) - except Exception as e: - logger.error(f"Job {job_id}: Failed to process chunk {i+1}: {e}") - chunk.error = str(e) - chunks[i] = chunk + if not chunk_audio_data: + await self._fail_job(job_id, "No chunks were successfully generated") + return - # Mark chunk as failed - if i not in current_metadata.failed_chunks: - current_metadata.failed_chunks.append(i) - self.job_manager._save_job_metadata(current_metadata) + if len(chunk_audio_data) < len(chunks): + logger.warning( + f"Job {job_id}: Only {len(chunk_audio_data)}/{len(chunks)} chunks generated successfully" + ) - # For now, continue with other chunks (could be made configurable) - continue + logger.info(f"Job {job_id}: Writing {len(chunk_audio_data)} audio files to disk") + chunk_audio_files = await self._batch_write_audio_files(job_id, chunk_audio_data) - # Check if we have enough successful chunks to continue - successful_chunks = [f for f in chunk_audio_files if f.exists()] - if len(successful_chunks) == 0: + successful_chunks = [path for path in chunk_audio_files if path.exists()] + if not successful_chunks: await self._fail_job(job_id, "No chunks were successfully generated") return - elif len(successful_chunks) < len(chunks): - logger.warning(f"Job {job_id}: Only {len(successful_chunks)}/{len(chunks)} chunks generated successfully") # Phase 3: Concatenate audio chunks await self._update_job_status(job_id, LongTextJobStatus.PROCESSING, "Combining audio chunks") @@ -244,11 +261,17 @@ async def _process_job(self, job_id: str): output_filename = f"final.{metadata.output_format}" output_path = self.job_manager._get_job_file_paths(job_id)['output_dir'] / output_filename + silence_padding_ms = parameters.get( + 'silence_padding_ms', Config.LONG_TEXT_SILENCE_PADDING_MS + ) + if silence_padding_ms is None or silence_padding_ms < 0: + silence_padding_ms = Config.LONG_TEXT_SILENCE_PADDING_MS + concatenation_metadata = concatenate_audio_files( audio_files=successful_chunks, output_path=output_path, output_format=metadata.output_format, - silence_duration_ms=Config.LONG_TEXT_SILENCE_PADDING_MS, + silence_duration_ms=silence_padding_ms, # normalize_volume=True, normalize_volume=False, remove_source_files=False # Keep source chunks for debugging @@ -283,6 +306,83 @@ async def _process_job(self, job_id: str): logger.error(traceback.format_exc()) await self._fail_job(job_id, f"Unexpected error: {e}") + async def _generate_chunk_audio( + self, + job_id: str, + chunk: LongTextChunk, + chunk_index: int, + 
voice_path: str, + language_id: str, + parameters: Dict[str, Any], + ): + """Generate audio for a single chunk (executed in parallel within a batch)""" + + chunk.processing_started_at = datetime.utcnow() + chunk.error = None + + logger.debug( + f"Job {job_id}: Processing chunk {chunk_index + 1} ({len(chunk.text)} chars)" + ) + + try: + pause_settings = (parameters.get('pause_settings') or {}) if parameters else {} + + audio_buffer = await generate_speech_internal( + text=chunk.text, + voice_sample_path=voice_path, + language_id=language_id, + exaggeration=(parameters or {}).get('exaggeration'), + cfg_weight=(parameters or {}).get('cfg_weight'), + temperature=(parameters or {}).get('temperature'), + enable_pauses=pause_settings.get('enable'), + custom_pauses=pause_settings.get('custom'), + ) + + chunk.audio_file = f"chunk_{chunk_index + 1:03d}.wav" + chunk.processing_completed_at = datetime.utcnow() + chunk.duration_ms = int( + (chunk.processing_completed_at - chunk.processing_started_at).total_seconds() * 1000 + ) + + logger.debug( + f"Job {job_id}: Completed chunk {chunk_index + 1} in {chunk.duration_ms}ms" + ) + + return audio_buffer, chunk + + except Exception as exc: + logger.error(f"Job {job_id}: Error processing chunk {chunk_index + 1}: {exc}") + raise + + async def _batch_write_audio_files( + self, + job_id: str, + chunk_audio_data: List[Tuple[int, Any, LongTextChunk]], + ) -> List[Path]: + """Write generated audio buffers to disk after GPU work completes""" + + job_paths = self.job_manager._get_job_file_paths(job_id) + written_paths: List[Path] = [] + + for chunk_index, audio_buffer, chunk in chunk_audio_data: + chunk_audio_path = job_paths['chunks_dir'] / chunk.audio_file + + await asyncio.to_thread( + self._write_audio_file, + chunk_audio_path, + audio_buffer.getvalue(), + ) + + written_paths.append(chunk_audio_path) + + logger.info(f"Job {job_id}: Wrote {len(written_paths)} audio files to disk") + return written_paths + + def _write_audio_file(self, path: Path, data: bytes) -> None: + """Write binary audio data to disk""" + with open(path, 'wb') as file_obj: + file_obj.write(data) + async def _update_job_status(self, job_id: str, status: LongTextJobStatus, message: str = ""): """Update job status""" try: diff --git a/app/core/long_text_jobs.py b/app/core/long_text_jobs.py index 6d5fb62..2d92971 100644 --- a/app/core/long_text_jobs.py +++ b/app/core/long_text_jobs.py @@ -158,14 +158,22 @@ def _load_input_text(self, job_id: str) -> Optional[str]: logger.error(f"Failed to load input text for job {job_id}: {e}") return None - def create_job(self, - text: str, - voice: Optional[str] = None, - output_format: str = "mp3", - exaggeration: Optional[float] = None, - cfg_weight: Optional[float] = None, - temperature: Optional[float] = None, - session_id: Optional[str] = None) -> Tuple[str, int]: + def create_job( + self, + text: str, + voice: Optional[str] = None, + output_format: str = "mp3", + exaggeration: Optional[float] = None, + cfg_weight: Optional[float] = None, + temperature: Optional[float] = None, + session_id: Optional[str] = None, + chunking_strategy: Optional[str] = None, + chunk_size: Optional[int] = None, + silence_padding: Optional[int] = None, + quality_preset: Optional[str] = None, + enable_pauses: Optional[bool] = None, + custom_pauses: Optional[Dict[str, int]] = None, + ) -> Tuple[str, int]: """ Create a new long text job @@ -178,8 +186,36 @@ def create_job(self, # Calculate text hash for potential deduplication text_hash = self._generate_text_hash(text) + # Resolve 
chunking configuration + resolved_chunk_size = chunk_size or Config.LONG_TEXT_CHUNK_SIZE + if resolved_chunk_size <= 0: + resolved_chunk_size = Config.LONG_TEXT_CHUNK_SIZE + + resolved_chunking_strategy = chunking_strategy or Config.LONG_TEXT_CHUNKING_STRATEGY + resolved_silence_padding = ( + silence_padding + if silence_padding is not None and silence_padding >= 0 + else Config.LONG_TEXT_SILENCE_PADDING_MS + ) + resolved_quality_preset = quality_preset or Config.LONG_TEXT_QUALITY_PRESET + + resolved_enable_pauses = ( + Config.ENABLE_PUNCTUATION_PAUSES if enable_pauses is None else bool(enable_pauses) + ) + + resolved_custom_pauses = None + if custom_pauses: + resolved_custom_pauses = {} + for key, value in custom_pauses.items(): + try: + resolved_custom_pauses[str(key)] = int(value) + except (TypeError, ValueError): + logger.debug("Ignoring invalid custom pause value %r=%r", key, value) + if not resolved_custom_pauses: + resolved_custom_pauses = None + # Estimate number of chunks - estimated_chunks = max(1, (len(text) + Config.LONG_TEXT_CHUNK_SIZE - 1) // Config.LONG_TEXT_CHUNK_SIZE) + estimated_chunks = max(1, (len(text) + resolved_chunk_size - 1) // resolved_chunk_size) # Create job directories self._create_job_directories(job_id) @@ -203,7 +239,15 @@ def create_job(self, 'exaggeration': exaggeration, 'cfg_weight': cfg_weight, 'temperature': temperature, - 'output_format': output_format + 'output_format': output_format, + 'chunking_strategy': resolved_chunking_strategy, + 'chunk_size': resolved_chunk_size, + 'quality_preset': resolved_quality_preset, + 'silence_padding_ms': resolved_silence_padding, + 'pause_settings': { + 'enable': resolved_enable_pauses, + 'custom': resolved_custom_pauses, + }, }, output_format=output_format, user_session_id=session_id @@ -754,7 +798,13 @@ def retry_job(self, job_id: str, preserve_chunks: bool = True, exaggeration=parameters.get('exaggeration'), cfg_weight=parameters.get('cfg_weight'), temperature=parameters.get('temperature'), - session_id=original_metadata.user_session_id + session_id=original_metadata.user_session_id, + chunking_strategy=parameters.get('chunking_strategy'), + chunk_size=parameters.get('chunk_size'), + silence_padding=parameters.get('silence_padding_ms'), + quality_preset=parameters.get('quality_preset'), + enable_pauses=(parameters.get('pause_settings') or {}).get('enable'), + custom_pauses=(parameters.get('pause_settings') or {}).get('custom'), ) # Update metadata to link to original job diff --git a/app/core/pause_handler.py b/app/core/pause_handler.py new file mode 100644 index 0000000..54aaa7b --- /dev/null +++ b/app/core/pause_handler.py @@ -0,0 +1,172 @@ +"""Utility classes for punctuation-based pause handling.""" + +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass +from typing import Dict, List, Optional + +from app.config import Config + +logger = logging.getLogger(__name__) + + +@dataclass +class TextChunk: + """Representation of a processed text segment.""" + + text: str + pause_after_ms: int + original_separator: Optional[str] = None + + +class PauseHandler: + """Split text around punctuation and expose pause metadata.""" + + DEFAULT_PAUSES: Dict[str, int] = { + r"\.\.\.": Config.ELLIPSIS_PAUSE_MS, + r"—": Config.EM_DASH_PAUSE_MS, + r"–": Config.EN_DASH_PAUSE_MS, + r"\.": Config.PERIOD_PAUSE_MS, + r"\n\n": Config.PARAGRAPH_PAUSE_MS, + r"\n": Config.LINE_BREAK_PAUSE_MS, + } + + def __init__( + self, + enable_pauses: bool = True, + custom_pauses: Optional[Dict[str, int]] = 
None, + min_pause_ms: int = Config.MIN_PAUSE_MS, + max_pause_ms: int = Config.MAX_PAUSE_MS, + ) -> None: + self.enable_pauses = enable_pauses + self.min_pause_ms = min_pause_ms + self.max_pause_ms = max_pause_ms + + self.pause_patterns: Dict[str, int] = {} + for pattern, duration in self.DEFAULT_PAUSES.items(): + self.pause_patterns[self._normalize_pattern(pattern)] = int(duration) + + if custom_pauses: + for raw_pattern, duration in custom_pauses.items(): + try: + normalized = self._normalize_pattern(raw_pattern) + self.pause_patterns[normalized] = int(duration) + except (TypeError, ValueError) as exc: + logger.debug("Ignoring invalid custom pause %r: %s", raw_pattern, exc) + + logger.debug("PauseHandler initialised with %d patterns", len(self.pause_patterns)) + + def process(self, text: str) -> List[TextChunk]: + """Split text and annotate pauses.""" + + cleaned = text.strip() + if not self.enable_pauses or not cleaned: + return [TextChunk(text=cleaned, pause_after_ms=0)] if cleaned else [] + + matches: List[Dict[str, object]] = [] + for pattern, duration in self.pause_patterns.items(): + compiled = re.compile(pattern) + for match in compiled.finditer(text): + matches.append( + { + "start": match.start(), + "end": match.end(), + "separator": match.group(0), + "pause_ms": int(duration), + } + ) + + matches.sort(key=lambda m: (int(m["start"]), -(int(m["end"]) - int(m["start"])))) + + chunks: List[TextChunk] = [] + position = 0 + for match in matches: + start = int(match["start"]) + end = int(match["end"]) + if start < position: + continue + + chunk_text = text[position:start].strip() + if chunk_text: + pause_ms = self._clamp_pause(int(match["pause_ms"])) + chunks.append( + TextChunk( + text=chunk_text, + pause_after_ms=pause_ms, + original_separator=str(match["separator"]), + ) + ) + + position = end + + remaining = text[position:].strip() + if remaining: + chunks.append(TextChunk(text=remaining, pause_after_ms=0, original_separator=None)) + + logger.debug("Split text into %d pause-aware chunks", len(chunks)) + return chunks + + def estimate_total_pause_time(self, text: str) -> int: + """Estimate cumulative pause duration for ``text``.""" + + return sum(chunk.pause_after_ms for chunk in self.process(text)) + + def get_pause_summary(self, text: str) -> Dict[str, object]: + """Return statistics about pauses for ``text``.""" + + chunks = self.process(text) + pause_types: Dict[str, int] = {} + for chunk in chunks: + if chunk.pause_after_ms > 0: + separator = chunk.original_separator or "other" + pause_types[separator] = pause_types.get(separator, 0) + 1 + + return { + "total_chunks": len(chunks), + "total_pause_ms": sum(chunk.pause_after_ms for chunk in chunks), + "pause_types": pause_types, + "chunks_with_pauses": sum(1 for chunk in chunks if chunk.pause_after_ms > 0), + } + + def _clamp_pause(self, pause_ms: int) -> int: + return max(self.min_pause_ms, min(pause_ms, self.max_pause_ms)) + + @staticmethod + def _normalize_pattern(pattern: str) -> str: + if pattern is None: + raise ValueError("Pause pattern cannot be None") + if "\\" in pattern: + return pattern + return re.escape(pattern) + + +def split_text_with_pauses( + text: str, + enable_pauses: bool = True, + custom_pauses: Optional[Dict[str, int]] = None, + min_pause_ms: int = Config.MIN_PAUSE_MS, + max_pause_ms: int = Config.MAX_PAUSE_MS, +) -> List[TextChunk]: + """Convenience wrapper around :class:`PauseHandler`.""" + + pause_mapping: Dict[str, int] = { + r"\.\.\.": Config.ELLIPSIS_PAUSE_MS, + r"—": Config.EM_DASH_PAUSE_MS, + r"–": 
Config.EN_DASH_PAUSE_MS,
+        r"\.": Config.PERIOD_PAUSE_MS,
+        r"\n\n": Config.PARAGRAPH_PAUSE_MS,
+        r"\n": Config.LINE_BREAK_PAUSE_MS,
+    }
+
+    if custom_pauses:
+        pause_mapping.update(custom_pauses)
+
+    handler = PauseHandler(
+        enable_pauses=enable_pauses,
+        custom_pauses=pause_mapping,
+        min_pause_ms=min_pause_ms,
+        max_pause_ms=max_pause_ms,
+    )
+    return handler.process(text)
diff --git a/app/core/quality_presets.py b/app/core/quality_presets.py
new file mode 100644
index 0000000..3ead838
--- /dev/null
+++ b/app/core/quality_presets.py
@@ -0,0 +1,44 @@
+"""Quality presets for TTS generation configurable via environment variables."""
+
+from app.config import Config
+
+
+def get_quality_preset(preset_name: str | None = None) -> dict:
+    """Return the quality preset configuration.
+
+    Args:
+        preset_name: Name of the preset to retrieve. Defaults to the configured
+            LONG_TEXT_QUALITY_PRESET when ``None`` is provided.
+
+    Returns:
+        A dictionary with ``chunk_size``, ``cfg_weight`` and ``temperature``
+        settings. Falls back to the ``balanced`` preset when the requested name
+        is not defined.
+    """
+
+    if preset_name is None:
+        preset_name = Config.LONG_TEXT_QUALITY_PRESET
+
+    return Config.QUALITY_PRESETS.get(preset_name, Config.QUALITY_PRESETS["balanced"])
+
+
+def get_chunk_size_for_preset(preset_name: str | None = None) -> int:
+    """Return the chunk size associated with a preset.
+
+    Args:
+        preset_name: Optional preset name. When omitted the configured default
+            preset is used.
+
+    Returns:
+        Chunk size for the preset, falling back to ``Config.LONG_TEXT_CHUNK_SIZE``
+        if the resolved preset does not define a valid chunk size.
+    """
+
+    preset = get_quality_preset(preset_name)
+
+    if preset is not None:
+        chunk_size = preset.get("chunk_size")
+        if isinstance(chunk_size, int) and chunk_size > 0:
+            return chunk_size
+
+    return Config.LONG_TEXT_CHUNK_SIZE
diff --git a/app/core/text_processing.py b/app/core/text_processing.py
index d90c90f..9c2a71c 100644
--- a/app/core/text_processing.py
+++ b/app/core/text_processing.py
@@ -380,57 +380,158 @@ def concatenate_audio_chunks(audio_chunks: list, sample_rate: int) -> torch.Tens
     return concatenated
 
 
-def split_text_for_long_generation(text: str,
-                                  max_chunk_size: Optional[int] = None,
-                                  overlap_chars: int = 0) -> List[LongTextChunk]:
-    """
-    Split long text into chunks optimized for TTS generation with intelligent boundaries.
+def chunk_text(text: str, strategy: str = "sentence", max_length: Optional[int] = None) -> List[str]:
+    """Split text according to the requested strategy."""
 
-    This function implements a hierarchical splitting strategy:
-    1. First attempt: Split at paragraph boundaries (double newlines)
-    2. Second attempt: Split at sentence boundaries (. ! ?)
-    3. Third attempt: Split at clause boundaries (, ; : - —)
-    4.
Last resort: Split at word boundaries + if max_length is None or max_length <= 0: + max_length = Config.LONG_TEXT_CHUNK_SIZE - Args: - text: Input text to split (should be > 3000 characters) - max_chunk_size: Maximum characters per chunk (defaults to Config.LONG_TEXT_CHUNK_SIZE) - overlap_chars: Number of characters to overlap between chunks for context + # Respect the standard TTS hard limit with a small buffer to avoid boundary errors + effective_max = min(max_length, Config.MAX_TOTAL_LENGTH - 100) + if effective_max <= 0: + effective_max = max_length - Returns: - List of LongTextChunk objects with metadata - """ - if max_chunk_size is None: - max_chunk_size = Config.LONG_TEXT_CHUNK_SIZE + cleaned = text.strip() + if not cleaned: + return [] - # Ensure we don't exceed the regular TTS limit - effective_max = min(max_chunk_size, Config.MAX_TOTAL_LENGTH - 100) # Leave some buffer + normalized_strategy = (strategy or "sentence").lower() - chunks = [] - chunk_index = 0 + if normalized_strategy == "fixed": + return [ + chunk.strip() + for chunk in (cleaned[i : i + effective_max] for i in range(0, len(cleaned), effective_max)) + if chunk.strip() + ] + + if normalized_strategy == "paragraph": + return _chunk_by_paragraphs(cleaned, effective_max) + + if normalized_strategy == "word": + return _chunk_by_words(cleaned, effective_max) + + # Default strategy combines hierarchical paragraph/sentence/word splitting + return _chunk_hierarchical(cleaned, effective_max) + + +def _chunk_by_paragraphs(text: str, max_length: int) -> List[str]: + """Chunk text prioritising paragraph boundaries.""" + + paragraphs = [segment.strip() for segment in re.split(r"\n\s*\n", text) if segment.strip()] + if not paragraphs: + return _chunk_hierarchical(text, max_length) + + chunks: List[str] = [] + current: Optional[str] = None + + for paragraph in paragraphs: + if current is None: + current = paragraph + continue + + candidate = f"{current}\n\n{paragraph}" + if len(candidate) <= max_length: + current = candidate + else: + chunks.extend(_chunk_hierarchical(current, max_length)) + current = paragraph + + if current: + chunks.extend(_chunk_hierarchical(current, max_length)) + + return chunks + + +def _chunk_by_words(text: str, max_length: int) -> List[str]: + """Chunk text using word boundaries.""" + + words = text.split() + if not words: + return [] + + chunks: List[str] = [] + current_words: List[str] = [] + current_length = 0 + + for word in words: + # Include a space when the current chunk already has words + additional_length = len(word) if not current_words else len(word) + 1 + + if current_words and current_length + additional_length > max_length: + chunk = " ".join(current_words).strip() + if chunk: + chunks.append(chunk) + current_words = [word] + current_length = len(word) + else: + current_words.append(word) + current_length += additional_length + + if current_words: + chunk = " ".join(current_words).strip() + if chunk: + chunks.append(chunk) + + refined_chunks: List[str] = [] + for chunk in chunks: + if len(chunk) > max_length: + refined_chunks.extend(_chunk_hierarchical(chunk, max_length)) + else: + refined_chunks.append(chunk) + + return refined_chunks + + +def _chunk_hierarchical(text: str, max_length: int) -> List[str]: + """Hierarchical chunking that mirrors the legacy behaviour.""" + + chunks: List[str] = [] remaining_text = text.strip() while remaining_text: - if len(remaining_text) <= effective_max: - # Last chunk - chunk_text = remaining_text - remaining_text = "" - else: - # Find the best split point 
- chunk_text, remaining_text = _find_best_split_point( - remaining_text, effective_max, overlap_chars - ) + if len(remaining_text) <= max_length: + chunks.append(remaining_text) + break - # Create chunk metadata - chunk = LongTextChunk( - index=chunk_index, - text=chunk_text, - text_preview=chunk_text[:50] + ("..." if len(chunk_text) > 50 else ""), - character_count=len(chunk_text) - ) + chunk_text, remaining = _find_best_split_point(remaining_text, max_length, 0) + + if not chunk_text: + chunk_text = remaining_text[:max_length].strip() + remaining = remaining_text[max_length:].strip() + + chunks.append(chunk_text) + remaining_text = remaining - chunks.append(chunk) - chunk_index += 1 + return [chunk for chunk in chunks if chunk] + + +def split_text_for_long_generation( + text: str, + max_chunk_size: Optional[int] = None, + strategy: Optional[str] = None, +) -> List[LongTextChunk]: + """Split long text into structured chunks ready for generation.""" + + if max_chunk_size is None or max_chunk_size <= 0: + max_chunk_size = Config.LONG_TEXT_CHUNK_SIZE + + resolved_strategy = strategy or Config.LONG_TEXT_CHUNKING_STRATEGY + + chunk_strings = chunk_text(text, strategy=resolved_strategy, max_length=max_chunk_size) + if not chunk_strings: + return [] + + chunks: List[LongTextChunk] = [] + for index, chunk_body in enumerate(chunk_strings): + preview = chunk_body[:50] + ("..." if len(chunk_body) > 50 else "") + chunks.append( + LongTextChunk( + index=index, + text=chunk_body, + text_preview=preview, + character_count=len(chunk_body), + ) + ) return chunks @@ -562,13 +663,18 @@ def _split_at_words(text: str, max_length: int, overlap_chars: int) -> Tuple[str return chunk_text, remaining_text -def estimate_processing_time(text_length: int, avg_chars_per_second: float = 25.0) -> int: +def estimate_processing_time( + text_length: int, + avg_chars_per_second: float = 25.0, + chunk_size: Optional[int] = None, +) -> int: """ Estimate processing time for long text TTS generation. Args: text_length: Total characters in text avg_chars_per_second: Average processing rate (characters per second) + chunk_size: Optional chunk size override used for estimation Returns: Estimated processing time in seconds @@ -576,8 +682,12 @@ def estimate_processing_time(text_length: int, avg_chars_per_second: float = 25. # Base estimate + overhead for chunking and concatenation base_time = text_length / avg_chars_per_second + effective_chunk_size = chunk_size or Config.LONG_TEXT_CHUNK_SIZE + if effective_chunk_size <= 0: + effective_chunk_size = Config.LONG_TEXT_CHUNK_SIZE + # Add overhead: 5 seconds for setup + 2 seconds per chunk + 10 seconds for concatenation - num_chunks = max(1, (text_length + Config.LONG_TEXT_CHUNK_SIZE - 1) // Config.LONG_TEXT_CHUNK_SIZE) + num_chunks = max(1, (text_length + effective_chunk_size - 1) // effective_chunk_size) overhead = 5 + (num_chunks * 2) + 10 return int(base_time + overhead) @@ -594,12 +704,19 @@ def validate_long_text_input(text: str) -> Tuple[bool, str]: return False, "Input text cannot be empty" text_length = len(text.strip()) + min_length = Config.get_long_text_min_length() + max_length = Config.get_long_text_max_length() + + if text_length < min_length: + return False, ( + "Text must be at least {} characters for long-text processing (received {} characters)".format( + min_length, + text_length, + ) + ) - if text_length <= Config.MAX_TOTAL_LENGTH: - return False, f"Text is {text_length} characters. 
Use regular TTS for texts under {Config.MAX_TOTAL_LENGTH} characters" - - if text_length > Config.LONG_TEXT_MAX_LENGTH: - return False, f"Text is too long ({text_length} characters). Maximum allowed: {Config.LONG_TEXT_MAX_LENGTH}" + if text_length > max_length: + return False, f"Text is too long ({text_length} characters). Maximum allowed: {max_length}" # Check for excessive repetition (potential spam/abuse) words = text.split() diff --git a/app/models/long_text.py b/app/models/long_text.py index 08379e2..636c81f 100644 --- a/app/models/long_text.py +++ b/app/models/long_text.py @@ -4,10 +4,12 @@ from datetime import datetime from enum import Enum -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any, List, Literal from pydantic import BaseModel, Field, field_validator from uuid import UUID +from app.config import Config + class LongTextJobStatus(str, Enum): """Status enum for long text jobs""" @@ -28,20 +30,116 @@ class LongTextJobActionType(str, Enum): class LongTextRequest(BaseModel): """Request model for long text TTS generation""" - input: str = Field(..., min_length=3001, description="Text to convert to speech (must be > 3000 characters)") + input: str = Field( + ..., description="Text to convert to speech (must meet the configured minimum length)" + ) voice: Optional[str] = Field(None, description="Voice name from library or OpenAI voice name") response_format: Optional[str] = Field("mp3", description="Audio format (mp3 or wav)") exaggeration: Optional[float] = Field(None, ge=0.25, le=2.0, description="Emotion intensity") cfg_weight: Optional[float] = Field(None, ge=0.0, le=1.0, description="Pace control") temperature: Optional[float] = Field(None, ge=0.05, le=5.0, description="Sampling temperature") session_id: Optional[str] = Field(None, description="Frontend session ID for tracking") + chunking_strategy: Optional[Literal["sentence", "paragraph", "word", "fixed"]] = Field( + None, description="Strategy to use when chunking the text" + ) + quality_preset: Optional[Literal["fast", "balanced", "high"]] = Field( + None, description="Quality preset balancing speed vs fidelity" + ) + chunk_size: Optional[int] = Field( + None, + gt=0, + description="Custom chunk size override (takes precedence over presets and defaults)", + ) + silence_padding_ms: Optional[int] = Field( + None, + ge=0, + description="Silence padding between chunks in milliseconds", + ) + enable_pauses: Optional[bool] = Field( + None, + description="Enable punctuation-based pauses when generating chunk audio", + ) + custom_pauses: Optional[Dict[str, int]] = Field( + None, + description="Custom pause patterns and durations in milliseconds", + ) @field_validator('input') @classmethod - def validate_input_length(cls, v): - if len(v) > 100000: # Will be validated against Config.LONG_TEXT_MAX_LENGTH at runtime - raise ValueError('Input text exceeds maximum length of 100000 characters') - return v.strip() + def validate_input_length(cls, v: str) -> str: + cleaned = v.strip() + text_length = len(cleaned) + min_length = Config.get_long_text_min_length() + max_length = Config.get_long_text_max_length() + + if text_length < min_length: + raise ValueError( + f"Input text must be at least {min_length} characters for long text processing" + ) + + if text_length > max_length: + raise ValueError( + f"Input text exceeds maximum length of {max_length} characters" + ) + + return cleaned + + def get_chunking_strategy(self) -> str: + """Return the requested chunking strategy with configuration fallback.""" + + 
return self.chunking_strategy or Config.LONG_TEXT_CHUNKING_STRATEGY + + def get_quality_preset(self) -> str: + """Return the requested quality preset with configuration fallback.""" + + return self.quality_preset or Config.LONG_TEXT_QUALITY_PRESET + + def get_chunk_size(self, preset_config: Dict[str, Any]) -> int: + """Resolve the chunk size using custom value, preset, then config.""" + + if self.chunk_size: + return self.chunk_size + return int(preset_config.get("chunk_size", Config.LONG_TEXT_CHUNK_SIZE)) + + def get_silence_padding(self) -> int: + """Resolve the silence padding with fallback to configuration.""" + + if self.silence_padding_ms is not None: + return self.silence_padding_ms + return Config.LONG_TEXT_SILENCE_PADDING_MS + + def resolve_pause_settings(self) -> Dict[str, Any]: + """Return pause handling configuration with defaults applied.""" + + enable = ( + self.enable_pauses + if self.enable_pauses is not None + else Config.ENABLE_PUNCTUATION_PAUSES + ) + return { + "enable": bool(enable), + "custom": self.custom_pauses or None, + } + + @field_validator('custom_pauses') + @classmethod + def validate_custom_pauses(cls, value: Optional[Dict[str, Any]]): + if value is None: + return value + + cleaned: Dict[str, int] = {} + for key, duration in value.items(): + if duration is None: + raise ValueError(f'Pause duration for {key!r} cannot be None') + try: + int_duration = int(duration) + except (TypeError, ValueError) as exc: + raise ValueError(f'Invalid pause duration for {key!r}: {duration!r}') from exc + if int_duration < 0: + raise ValueError(f'Pause duration for {key!r} must be non-negative') + cleaned[str(key)] = int_duration + + return cleaned class LongTextChunk(BaseModel): @@ -63,7 +161,7 @@ class LongTextJobMetadata(BaseModel): created_at: datetime = Field(default_factory=datetime.utcnow) updated_at: datetime = Field(default_factory=datetime.utcnow) status: LongTextJobStatus = Field(default=LongTextJobStatus.PENDING) - text_length: int = Field(..., ge=3001, description="Total characters in input text") + text_length: int = Field(..., ge=1, description="Total characters in input text") text_hash: str = Field(..., description="SHA256 hash of input text for deduplication") total_chunks: int = Field(..., ge=1, description="Total number of chunks") completed_chunks: int = Field(default=0, ge=0, description="Number of completed chunks") diff --git a/app/models/requests.py b/app/models/requests.py index e4cd6ad..8ef304f 100644 --- a/app/models/requests.py +++ b/app/models/requests.py @@ -1,8 +1,7 @@ -""" -Request models for API validation -""" +"""Request models for API validation""" + +from typing import Dict, Optional -from typing import Optional from pydantic import BaseModel, Field, validator @@ -25,6 +24,16 @@ class TTSRequest(BaseModel): streaming_strategy: Optional[str] = Field(None, description="Chunking strategy for streaming") streaming_buffer_size: Optional[int] = Field(None, description="Number of chunks to buffer", ge=1, le=10) streaming_quality: Optional[str] = Field(None, description="Speed vs quality trade-off") + + # Pause handling parameters + enable_pauses: Optional[bool] = Field( + None, + description="Enable punctuation-based pauses (defaults to server configuration)", + ) + custom_pauses: Optional[Dict[str, int]] = Field( + None, + description="Custom pause durations in milliseconds keyed by punctuation", + ) @validator('input') def validate_input(cls, v): @@ -54,4 +63,23 @@ def validate_streaming_quality(cls, v): allowed_qualities = ['fast', 'balanced', 
'high'] if v not in allowed_qualities: raise ValueError(f'streaming_quality must be one of: {", ".join(allowed_qualities)}') - return v \ No newline at end of file + return v + + @validator('custom_pauses') + def validate_custom_pauses(cls, value): + if value is None: + return value + + cleaned: Dict[str, int] = {} + for key, duration in value.items(): + if duration is None: + raise ValueError('custom pause duration cannot be None') + try: + int_duration = int(duration) + except (TypeError, ValueError) as exc: + raise ValueError(f'invalid pause duration for {key!r}: {duration!r}') from exc + if int_duration < 0: + raise ValueError(f'pause duration for {key!r} must be non-negative') + cleaned[str(key)] = int_duration + + return cleaned \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index a056a48..888eb64 100755 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -63,6 +63,7 @@ ENV LONG_TEXT_CHUNK_SIZE=2500 ENV LONG_TEXT_SILENCE_PADDING_MS=200 ENV LONG_TEXT_JOB_RETENTION_DAYS=7 ENV LONG_TEXT_MAX_CONCURRENT_JOBS=3 +ENV LONG_TEXT_MIN_LENGTH=100 # Expose port EXPOSE ${PORT} @@ -72,4 +73,4 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5m --retries=3 \ CMD curl -f http://localhost:${PORT}/health || exit 1 # Run the application with the new entry point -CMD ["python", "main.py"] \ No newline at end of file +CMD ["python", "main.py"] diff --git a/docker/Dockerfile.blackwell b/docker/Dockerfile.blackwell index 0c7d963..089f979 100644 --- a/docker/Dockerfile.blackwell +++ b/docker/Dockerfile.blackwell @@ -88,6 +88,7 @@ ENV LONG_TEXT_CHUNK_SIZE=2500 ENV LONG_TEXT_SILENCE_PADDING_MS=200 ENV LONG_TEXT_JOB_RETENTION_DAYS=7 ENV LONG_TEXT_MAX_CONCURRENT_JOBS=3 +ENV LONG_TEXT_MIN_LENGTH=100 # NVIDIA/CUDA environment variables ENV NVIDIA_VISIBLE_DEVICES=all @@ -101,4 +102,4 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5m --retries=3 \ CMD curl -f http://localhost:${PORT}/health || exit 1 # Run the application using the new entry point -CMD ["python", "main.py"] \ No newline at end of file +CMD ["python", "main.py"] diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 9502a22..e81d0ff 100755 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -63,6 +63,7 @@ ENV LONG_TEXT_CHUNK_SIZE=2500 ENV LONG_TEXT_SILENCE_PADDING_MS=200 ENV LONG_TEXT_JOB_RETENTION_DAYS=7 ENV LONG_TEXT_MAX_CONCURRENT_JOBS=3 +ENV LONG_TEXT_MIN_LENGTH=100 # Expose port EXPOSE ${PORT} @@ -72,4 +73,4 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5m --retries=3 \ CMD curl -f http://localhost:${PORT}/health || exit 1 # Run the application with the new entry point -CMD ["python", "main.py"] \ No newline at end of file +CMD ["python", "main.py"] diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index 0bea747..b3dcd94 100755 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -71,6 +71,7 @@ ENV LONG_TEXT_CHUNK_SIZE=2500 ENV LONG_TEXT_SILENCE_PADDING_MS=200 ENV LONG_TEXT_JOB_RETENTION_DAYS=7 ENV LONG_TEXT_MAX_CONCURRENT_JOBS=3 +ENV LONG_TEXT_MIN_LENGTH=100 # NVIDIA/CUDA environment variables ENV NVIDIA_VISIBLE_DEVICES=all @@ -84,4 +85,4 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5m --retries=3 \ CMD curl -f http://localhost:${PORT}/health || exit 1 # Run the application with the new entry point -CMD ["python", "main.py"] \ No newline at end of file +CMD ["python", "main.py"] diff --git a/docker/Dockerfile.uv b/docker/Dockerfile.uv index 80d4363..923d796 100644 --- a/docker/Dockerfile.uv +++ b/docker/Dockerfile.uv @@ -79,6 +79,7 @@ 
ENV LONG_TEXT_CHUNK_SIZE=2500
 ENV LONG_TEXT_SILENCE_PADDING_MS=200
 ENV LONG_TEXT_JOB_RETENTION_DAYS=7
 ENV LONG_TEXT_MAX_CONCURRENT_JOBS=3
+ENV LONG_TEXT_MIN_LENGTH=100

 # Expose port
 EXPOSE ${PORT}
@@ -88,4 +89,4 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5m --retries=3 \
     CMD curl -f http://localhost:${PORT}/health || exit 1

 # Run the application using the new entry point
-CMD ["python", "main.py"]
\ No newline at end of file
+CMD ["python", "main.py"]
diff --git a/docker/Dockerfile.uv.gpu b/docker/Dockerfile.uv.gpu
index fd95717..fe6138a 100644
--- a/docker/Dockerfile.uv.gpu
+++ b/docker/Dockerfile.uv.gpu
@@ -78,6 +78,7 @@ ENV LONG_TEXT_CHUNK_SIZE=2500
 ENV LONG_TEXT_SILENCE_PADDING_MS=200
 ENV LONG_TEXT_JOB_RETENTION_DAYS=7
 ENV LONG_TEXT_MAX_CONCURRENT_JOBS=3
+ENV LONG_TEXT_MIN_LENGTH=100

 # NVIDIA/CUDA environment variables
 ENV NVIDIA_VISIBLE_DEVICES=all
@@ -91,4 +92,4 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5m --retries=3 \
     CMD curl -f http://localhost:${PORT}/health || exit 1

 # Run the application using the virtual environment Python
-CMD ["/app/.venv/bin/python", "main.py"]
\ No newline at end of file
+CMD ["/app/.venv/bin/python", "main.py"]
diff --git a/docker/docker-compose.blackwell.yml b/docker/docker-compose.blackwell.yml
index 240bcf1..7716ca8 100644
--- a/docker/docker-compose.blackwell.yml
+++ b/docker/docker-compose.blackwell.yml
@@ -23,6 +23,35 @@ services:
       - MAX_CHUNK_LENGTH=${MAX_CHUNK_LENGTH:-280}
       - MAX_TOTAL_LENGTH=${MAX_TOTAL_LENGTH:-3000}

+      # Long Text TTS Settings
+      - LONG_TEXT_DATA_DIR=${LONG_TEXT_DATA_DIR:-/data/long_text_jobs}
+      - LONG_TEXT_MAX_LENGTH=${LONG_TEXT_MAX_LENGTH:-100000}
+      - LONG_TEXT_CHUNK_SIZE=${LONG_TEXT_CHUNK_SIZE:-2500}
+      - LONG_TEXT_SILENCE_PADDING_MS=${LONG_TEXT_SILENCE_PADDING_MS:-200}
+      - LONG_TEXT_JOB_RETENTION_DAYS=${LONG_TEXT_JOB_RETENTION_DAYS:-7}
+      - LONG_TEXT_MAX_CONCURRENT_JOBS=${LONG_TEXT_MAX_CONCURRENT_JOBS:-3}
+      - LONG_TEXT_MIN_LENGTH=${LONG_TEXT_MIN_LENGTH:-100}
+      - LONG_TEXT_CHUNKING_STRATEGY=${LONG_TEXT_CHUNKING_STRATEGY:-sentence}
+      - LONG_TEXT_QUALITY_PRESET=${LONG_TEXT_QUALITY_PRESET:-balanced}
+      - QUALITY_FAST_CHUNK_SIZE=${QUALITY_FAST_CHUNK_SIZE:-1500}
+      - QUALITY_FAST_CFG_WEIGHT=${QUALITY_FAST_CFG_WEIGHT:-0.3}
+      - QUALITY_FAST_TEMPERATURE=${QUALITY_FAST_TEMPERATURE:-0.6}
+      - QUALITY_BALANCED_CHUNK_SIZE=${QUALITY_BALANCED_CHUNK_SIZE:-2500}
+      - QUALITY_BALANCED_CFG_WEIGHT=${QUALITY_BALANCED_CFG_WEIGHT:-0.5}
+      - QUALITY_BALANCED_TEMPERATURE=${QUALITY_BALANCED_TEMPERATURE:-0.8}
+      - QUALITY_HIGH_CHUNK_SIZE=${QUALITY_HIGH_CHUNK_SIZE:-2800}
+      - QUALITY_HIGH_CFG_WEIGHT=${QUALITY_HIGH_CFG_WEIGHT:-0.7}
+      - QUALITY_HIGH_TEMPERATURE=${QUALITY_HIGH_TEMPERATURE:-1.0}
+      - ENABLE_PUNCTUATION_PAUSES=${ENABLE_PUNCTUATION_PAUSES:-true}
+      - ELLIPSIS_PAUSE_MS=${ELLIPSIS_PAUSE_MS:-800}
+      - EM_DASH_PAUSE_MS=${EM_DASH_PAUSE_MS:-550}
+      - EN_DASH_PAUSE_MS=${EN_DASH_PAUSE_MS:-375}
+      - PERIOD_PAUSE_MS=${PERIOD_PAUSE_MS:-500}
+      - PARAGRAPH_PAUSE_MS=${PARAGRAPH_PAUSE_MS:-800}
+      - LINE_BREAK_PAUSE_MS=${LINE_BREAK_PAUSE_MS:-350}
+      - MIN_PAUSE_MS=${MIN_PAUSE_MS:-200}
+      - MAX_PAUSE_MS=${MAX_PAUSE_MS:-2000}
+
       # Voice and Model Settings
       - VOICE_SAMPLE_PATH=/app/voice-sample.mp3
       - DEVICE=${DEVICE:-cuda}
diff --git a/docker/docker-compose.cpu.yml b/docker/docker-compose.cpu.yml
index 482e2a9..8dc49c7 100755
--- a/docker/docker-compose.cpu.yml
+++ b/docker/docker-compose.cpu.yml
@@ -30,6 +30,26 @@ services:
       - LONG_TEXT_SILENCE_PADDING_MS=${LONG_TEXT_SILENCE_PADDING_MS:-200}
       - LONG_TEXT_JOB_RETENTION_DAYS=${LONG_TEXT_JOB_RETENTION_DAYS:-7}
       - 
LONG_TEXT_MAX_CONCURRENT_JOBS=${LONG_TEXT_MAX_CONCURRENT_JOBS:-3} + - LONG_TEXT_MIN_LENGTH=${LONG_TEXT_MIN_LENGTH:-100} + - LONG_TEXT_CHUNKING_STRATEGY=${LONG_TEXT_CHUNKING_STRATEGY:-sentence} + - LONG_TEXT_QUALITY_PRESET=${LONG_TEXT_QUALITY_PRESET:-balanced} + - QUALITY_FAST_CHUNK_SIZE=${QUALITY_FAST_CHUNK_SIZE:-1500} + - QUALITY_FAST_CFG_WEIGHT=${QUALITY_FAST_CFG_WEIGHT:-0.3} + - QUALITY_FAST_TEMPERATURE=${QUALITY_FAST_TEMPERATURE:-0.6} + - QUALITY_BALANCED_CHUNK_SIZE=${QUALITY_BALANCED_CHUNK_SIZE:-2500} + - QUALITY_BALANCED_CFG_WEIGHT=${QUALITY_BALANCED_CFG_WEIGHT:-0.5} + - QUALITY_BALANCED_TEMPERATURE=${QUALITY_BALANCED_TEMPERATURE:-0.8} + - QUALITY_HIGH_CHUNK_SIZE=${QUALITY_HIGH_CHUNK_SIZE:-2800} + - QUALITY_HIGH_CFG_WEIGHT=${QUALITY_HIGH_CFG_WEIGHT:-0.7} + - QUALITY_HIGH_TEMPERATURE=${QUALITY_HIGH_TEMPERATURE:-1.0} + - ENABLE_PUNCTUATION_PAUSES=${ENABLE_PUNCTUATION_PAUSES:-true} + - ELLIPSIS_PAUSE_MS=${ELLIPSIS_PAUSE_MS:-600} + - EM_DASH_PAUSE_MS=${EM_DASH_PAUSE_MS:-400} + - EN_DASH_PAUSE_MS=${EN_DASH_PAUSE_MS:-350} + - PARAGRAPH_PAUSE_MS=${PARAGRAPH_PAUSE_MS:-500} + - LINE_BREAK_PAUSE_MS=${LINE_BREAK_PAUSE_MS:-250} + - MIN_PAUSE_MS=${MIN_PAUSE_MS:-100} + - MAX_PAUSE_MS=${MAX_PAUSE_MS:-2000} # Voice and Model Settings - VOICE_SAMPLE_PATH=/app/voice-sample.mp3 diff --git a/docker/docker-compose.gpu.yml b/docker/docker-compose.gpu.yml index b37782f..57835f0 100755 --- a/docker/docker-compose.gpu.yml +++ b/docker/docker-compose.gpu.yml @@ -30,6 +30,26 @@ services: - LONG_TEXT_SILENCE_PADDING_MS=${LONG_TEXT_SILENCE_PADDING_MS:-200} - LONG_TEXT_JOB_RETENTION_DAYS=${LONG_TEXT_JOB_RETENTION_DAYS:-7} - LONG_TEXT_MAX_CONCURRENT_JOBS=${LONG_TEXT_MAX_CONCURRENT_JOBS:-3} + - LONG_TEXT_MIN_LENGTH=${LONG_TEXT_MIN_LENGTH:-100} + - LONG_TEXT_CHUNKING_STRATEGY=${LONG_TEXT_CHUNKING_STRATEGY:-sentence} + - LONG_TEXT_QUALITY_PRESET=${LONG_TEXT_QUALITY_PRESET:-balanced} + - QUALITY_FAST_CHUNK_SIZE=${QUALITY_FAST_CHUNK_SIZE:-1500} + - QUALITY_FAST_CFG_WEIGHT=${QUALITY_FAST_CFG_WEIGHT:-0.3} + - QUALITY_FAST_TEMPERATURE=${QUALITY_FAST_TEMPERATURE:-0.6} + - QUALITY_BALANCED_CHUNK_SIZE=${QUALITY_BALANCED_CHUNK_SIZE:-2500} + - QUALITY_BALANCED_CFG_WEIGHT=${QUALITY_BALANCED_CFG_WEIGHT:-0.5} + - QUALITY_BALANCED_TEMPERATURE=${QUALITY_BALANCED_TEMPERATURE:-0.8} + - QUALITY_HIGH_CHUNK_SIZE=${QUALITY_HIGH_CHUNK_SIZE:-2800} + - QUALITY_HIGH_CFG_WEIGHT=${QUALITY_HIGH_CFG_WEIGHT:-0.7} + - QUALITY_HIGH_TEMPERATURE=${QUALITY_HIGH_TEMPERATURE:-1.0} + - ENABLE_PUNCTUATION_PAUSES=${ENABLE_PUNCTUATION_PAUSES:-true} + - ELLIPSIS_PAUSE_MS=${ELLIPSIS_PAUSE_MS:-600} + - EM_DASH_PAUSE_MS=${EM_DASH_PAUSE_MS:-400} + - EN_DASH_PAUSE_MS=${EN_DASH_PAUSE_MS:-350} + - PARAGRAPH_PAUSE_MS=${PARAGRAPH_PAUSE_MS:-500} + - LINE_BREAK_PAUSE_MS=${LINE_BREAK_PAUSE_MS:-250} + - MIN_PAUSE_MS=${MIN_PAUSE_MS:-100} + - MAX_PAUSE_MS=${MAX_PAUSE_MS:-2000} # Voice and Model Settings - VOICE_SAMPLE_PATH=/app/voice-sample.mp3 diff --git a/docker/docker-compose.uv.gpu.yml b/docker/docker-compose.uv.gpu.yml index a5c2586..3e66f94 100644 --- a/docker/docker-compose.uv.gpu.yml +++ b/docker/docker-compose.uv.gpu.yml @@ -30,6 +30,26 @@ services: - LONG_TEXT_SILENCE_PADDING_MS=${LONG_TEXT_SILENCE_PADDING_MS:-200} - LONG_TEXT_JOB_RETENTION_DAYS=${LONG_TEXT_JOB_RETENTION_DAYS:-7} - LONG_TEXT_MAX_CONCURRENT_JOBS=${LONG_TEXT_MAX_CONCURRENT_JOBS:-3} + - LONG_TEXT_MIN_LENGTH=${LONG_TEXT_MIN_LENGTH:-100} + - LONG_TEXT_CHUNKING_STRATEGY=${LONG_TEXT_CHUNKING_STRATEGY:-sentence} + - LONG_TEXT_QUALITY_PRESET=${LONG_TEXT_QUALITY_PRESET:-balanced} + - 
QUALITY_FAST_CHUNK_SIZE=${QUALITY_FAST_CHUNK_SIZE:-1500} + - QUALITY_FAST_CFG_WEIGHT=${QUALITY_FAST_CFG_WEIGHT:-0.3} + - QUALITY_FAST_TEMPERATURE=${QUALITY_FAST_TEMPERATURE:-0.6} + - QUALITY_BALANCED_CHUNK_SIZE=${QUALITY_BALANCED_CHUNK_SIZE:-2500} + - QUALITY_BALANCED_CFG_WEIGHT=${QUALITY_BALANCED_CFG_WEIGHT:-0.5} + - QUALITY_BALANCED_TEMPERATURE=${QUALITY_BALANCED_TEMPERATURE:-0.8} + - QUALITY_HIGH_CHUNK_SIZE=${QUALITY_HIGH_CHUNK_SIZE:-2800} + - QUALITY_HIGH_CFG_WEIGHT=${QUALITY_HIGH_CFG_WEIGHT:-0.7} + - QUALITY_HIGH_TEMPERATURE=${QUALITY_HIGH_TEMPERATURE:-1.0} + - ENABLE_PUNCTUATION_PAUSES=${ENABLE_PUNCTUATION_PAUSES:-true} + - ELLIPSIS_PAUSE_MS=${ELLIPSIS_PAUSE_MS:-600} + - EM_DASH_PAUSE_MS=${EM_DASH_PAUSE_MS:-400} + - EN_DASH_PAUSE_MS=${EN_DASH_PAUSE_MS:-350} + - PARAGRAPH_PAUSE_MS=${PARAGRAPH_PAUSE_MS:-500} + - LINE_BREAK_PAUSE_MS=${LINE_BREAK_PAUSE_MS:-250} + - MIN_PAUSE_MS=${MIN_PAUSE_MS:-100} + - MAX_PAUSE_MS=${MAX_PAUSE_MS:-2000} # Voice and Model Settings - VOICE_SAMPLE_PATH=/app/voice-sample.mp3 diff --git a/docker/docker-compose.uv.yml b/docker/docker-compose.uv.yml index 5193b5d..212f72a 100644 --- a/docker/docker-compose.uv.yml +++ b/docker/docker-compose.uv.yml @@ -30,6 +30,26 @@ services: - LONG_TEXT_SILENCE_PADDING_MS=${LONG_TEXT_SILENCE_PADDING_MS:-200} - LONG_TEXT_JOB_RETENTION_DAYS=${LONG_TEXT_JOB_RETENTION_DAYS:-7} - LONG_TEXT_MAX_CONCURRENT_JOBS=${LONG_TEXT_MAX_CONCURRENT_JOBS:-3} + - LONG_TEXT_MIN_LENGTH=${LONG_TEXT_MIN_LENGTH:-100} + - LONG_TEXT_CHUNKING_STRATEGY=${LONG_TEXT_CHUNKING_STRATEGY:-sentence} + - LONG_TEXT_QUALITY_PRESET=${LONG_TEXT_QUALITY_PRESET:-balanced} + - QUALITY_FAST_CHUNK_SIZE=${QUALITY_FAST_CHUNK_SIZE:-1500} + - QUALITY_FAST_CFG_WEIGHT=${QUALITY_FAST_CFG_WEIGHT:-0.3} + - QUALITY_FAST_TEMPERATURE=${QUALITY_FAST_TEMPERATURE:-0.6} + - QUALITY_BALANCED_CHUNK_SIZE=${QUALITY_BALANCED_CHUNK_SIZE:-2500} + - QUALITY_BALANCED_CFG_WEIGHT=${QUALITY_BALANCED_CFG_WEIGHT:-0.5} + - QUALITY_BALANCED_TEMPERATURE=${QUALITY_BALANCED_TEMPERATURE:-0.8} + - QUALITY_HIGH_CHUNK_SIZE=${QUALITY_HIGH_CHUNK_SIZE:-2800} + - QUALITY_HIGH_CFG_WEIGHT=${QUALITY_HIGH_CFG_WEIGHT:-0.7} + - QUALITY_HIGH_TEMPERATURE=${QUALITY_HIGH_TEMPERATURE:-1.0} + - ENABLE_PUNCTUATION_PAUSES=${ENABLE_PUNCTUATION_PAUSES:-true} + - ELLIPSIS_PAUSE_MS=${ELLIPSIS_PAUSE_MS:-600} + - EM_DASH_PAUSE_MS=${EM_DASH_PAUSE_MS:-400} + - EN_DASH_PAUSE_MS=${EN_DASH_PAUSE_MS:-350} + - PARAGRAPH_PAUSE_MS=${PARAGRAPH_PAUSE_MS:-500} + - LINE_BREAK_PAUSE_MS=${LINE_BREAK_PAUSE_MS:-250} + - MIN_PAUSE_MS=${MIN_PAUSE_MS:-100} + - MAX_PAUSE_MS=${MAX_PAUSE_MS:-2000} # Voice and Model Settings - VOICE_SAMPLE_PATH=/app/voice-sample.mp3 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index cdf18ee..cf697f8 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -30,6 +30,26 @@ services: - LONG_TEXT_SILENCE_PADDING_MS=${LONG_TEXT_SILENCE_PADDING_MS:-200} - LONG_TEXT_JOB_RETENTION_DAYS=${LONG_TEXT_JOB_RETENTION_DAYS:-7} - LONG_TEXT_MAX_CONCURRENT_JOBS=${LONG_TEXT_MAX_CONCURRENT_JOBS:-3} + - LONG_TEXT_MIN_LENGTH=${LONG_TEXT_MIN_LENGTH:-100} + - LONG_TEXT_CHUNKING_STRATEGY=${LONG_TEXT_CHUNKING_STRATEGY:-sentence} + - LONG_TEXT_QUALITY_PRESET=${LONG_TEXT_QUALITY_PRESET:-balanced} + - QUALITY_FAST_CHUNK_SIZE=${QUALITY_FAST_CHUNK_SIZE:-1500} + - QUALITY_FAST_CFG_WEIGHT=${QUALITY_FAST_CFG_WEIGHT:-0.3} + - QUALITY_FAST_TEMPERATURE=${QUALITY_FAST_TEMPERATURE:-0.6} + - QUALITY_BALANCED_CHUNK_SIZE=${QUALITY_BALANCED_CHUNK_SIZE:-2500} + - QUALITY_BALANCED_CFG_WEIGHT=${QUALITY_BALANCED_CFG_WEIGHT:-0.5} + 
- QUALITY_BALANCED_TEMPERATURE=${QUALITY_BALANCED_TEMPERATURE:-0.8}
+      - QUALITY_HIGH_CHUNK_SIZE=${QUALITY_HIGH_CHUNK_SIZE:-2800}
+      - QUALITY_HIGH_CFG_WEIGHT=${QUALITY_HIGH_CFG_WEIGHT:-0.7}
+      - QUALITY_HIGH_TEMPERATURE=${QUALITY_HIGH_TEMPERATURE:-1.0}
+      - ENABLE_PUNCTUATION_PAUSES=${ENABLE_PUNCTUATION_PAUSES:-true}
+      - ELLIPSIS_PAUSE_MS=${ELLIPSIS_PAUSE_MS:-600}
+      - EM_DASH_PAUSE_MS=${EM_DASH_PAUSE_MS:-400}
+      - EN_DASH_PAUSE_MS=${EN_DASH_PAUSE_MS:-350}
+      - PARAGRAPH_PAUSE_MS=${PARAGRAPH_PAUSE_MS:-500}
+      - LINE_BREAK_PAUSE_MS=${LINE_BREAK_PAUSE_MS:-250}
+      - MIN_PAUSE_MS=${MIN_PAUSE_MS:-100}
+      - MAX_PAUSE_MS=${MAX_PAUSE_MS:-2000}

       # Voice and Model Settings
       - VOICE_SAMPLE_PATH=/app/voice-sample.mp3
diff --git a/frontend/src/services/longTextTTS.ts b/frontend/src/services/longTextTTS.ts
index 688dd68..aa81ab7 100644
--- a/frontend/src/services/longTextTTS.ts
+++ b/frontend/src/services/longTextTTS.ts
@@ -47,6 +47,22 @@ export const createLongTextTTSService = (baseUrl: string, sessionId?: string) =>
       payload.response_format = request.output_format; // Backend expects 'response_format'
     }

+    if (request.silence_padding_ms !== undefined) {
+      payload.silence_padding_ms = request.silence_padding_ms;
+    }
+
+    if (request.chunking_strategy) {
+      payload.chunking_strategy = request.chunking_strategy;
+    }
+
+    if (request.quality_preset) {
+      payload.quality_preset = request.quality_preset;
+    }
+
+    if (request.chunk_size !== undefined) {
+      payload.chunk_size = request.chunk_size;
+    }
+
     // Add session ID for tracking
     if (sessionId) {
       payload.session_id = sessionId;
diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts
index 4d6fae9..f95bdfd 100644
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@@ -10,6 +10,8 @@ export interface TTSRequest {
   streaming_chunk_size?: number;
   streaming_strategy?: 'sentence' | 'paragraph' | 'fixed' | 'word';
   streaming_quality?: 'fast' | 'balanced' | 'high';
+  enable_pauses?: boolean;
+  custom_pauses?: Record<string, number>;
 }

 export interface HealthResponse {
@@ -262,10 +264,16 @@ export interface LongTextJobMetadata {
   };
   voice: string;
   parameters: {
-    exaggeration: number;
-    cfg_weight: number;
-    temperature: number;
-    language: string;
+    exaggeration?: number;
+    cfg_weight?: number;
+    temperature?: number;
+    output_format?: string;
+    chunking_strategy?: 'sentence' | 'paragraph' | 'word' | 'fixed';
+    chunk_size?: number;
+    quality_preset?: 'fast' | 'balanced' | 'high';
+    silence_padding_ms?: number;
+    language?: string;
+    [key: string]: unknown;
   };
   processing: {
     started_at?: string;
@@ -328,7 +336,12 @@ export interface LongTextRequest {
   voice_file?: File;
   output_format?: 'mp3' | 'wav';
   silence_padding_ms?: number;
+  chunking_strategy?: 'sentence' | 'paragraph' | 'word' | 'fixed';
+  quality_preset?: 'fast' | 'balanced' | 'high';
+  chunk_size?: number;
   session_id?: string;
+  enable_pauses?: boolean;
+  custom_pauses?: Record<string, number>;
 }

 export interface LongTextSSEEvent {
diff --git a/tests/test_pause_handler.py b/tests/test_pause_handler.py
new file mode 100644
index 0000000..feb72de
--- /dev/null
+++ b/tests/test_pause_handler.py
@@ -0,0 +1,129 @@
+import pytest
+
+from app.core.pause_handler import PauseHandler, split_text_with_pauses, TextChunk
+
+
+class TestPauseHandler:
+    def test_basic_ellipsis_pause(self):
+        handler = PauseHandler()
+        chunks = handler.process("Hello... 
world") + + assert len(chunks) == 2 + assert chunks[0].text == "Hello" + assert chunks[0].pause_after_ms == 600 + assert chunks[1].text == "world" + assert chunks[1].pause_after_ms == 0 + + def test_em_dash_pause(self): + handler = PauseHandler() + chunks = handler.process("Hello—world") + + assert len(chunks) == 2 + assert chunks[0].text == "Hello" + assert chunks[0].pause_after_ms == 400 + assert chunks[1].text == "world" + + def test_en_dash_pause(self): + handler = PauseHandler() + chunks = handler.process("Numbers 1–2") + + assert len(chunks) == 2 + assert chunks[0].text == "Numbers 1" + assert chunks[0].pause_after_ms == 350 + assert chunks[1].text == "2" + + def test_multiple_pauses(self): + handler = PauseHandler() + text = "Hello... I was thinking—maybe tomorrow?" + chunks = handler.process(text) + + assert len(chunks) == 3 + assert chunks[0].text == "Hello" + assert chunks[0].pause_after_ms == 600 + assert chunks[1].text == "I was thinking" + assert chunks[1].pause_after_ms == 400 + assert chunks[2].text == "maybe tomorrow?" + assert chunks[2].pause_after_ms == 0 + + def test_no_pauses_when_disabled(self): + handler = PauseHandler(enable_pauses=False) + chunks = handler.process("Hello... world—test") + + assert len(chunks) == 1 + assert chunks[0].text == "Hello... world—test" + assert chunks[0].pause_after_ms == 0 + + def test_line_break_pause(self): + handler = PauseHandler() + chunks = handler.process("Line one\nLine two") + + assert len(chunks) == 2 + assert chunks[0].text == "Line one" + assert chunks[0].pause_after_ms == 250 + assert chunks[1].text == "Line two" + + def test_paragraph_break_pause(self): + handler = PauseHandler() + chunks = handler.process("Paragraph one\n\nParagraph two") + + assert len(chunks) == 2 + assert chunks[0].text == "Paragraph one" + assert chunks[0].pause_after_ms == 500 + assert chunks[1].text == "Paragraph two" + + def test_custom_pause_durations(self): + custom = {r"\.\.\.": 1000} + handler = PauseHandler(custom_pauses=custom) + chunks = handler.process("Hello... world") + + assert chunks[0].pause_after_ms == 1000 + + def test_pause_clamping(self): + handler = PauseHandler(min_pause_ms=200, max_pause_ms=500) + custom = {r"\.\.\.": 2000} + handler.pause_patterns.update(custom) + + chunks = handler.process("Hello... world") + assert chunks[0].pause_after_ms == 500 + + def test_empty_text(self): + handler = PauseHandler() + chunks = handler.process("") + + assert len(chunks) == 0 + + def test_no_pause_punctuation(self): + handler = PauseHandler() + chunks = handler.process("Hello world, how are you?") + + assert len(chunks) == 1 + assert chunks[0].text == "Hello world, how are you?" + assert chunks[0].pause_after_ms == 0 + + def test_estimate_total_pause_time(self): + handler = PauseHandler() + text = "Hello... world—test" + total_pause = handler.estimate_total_pause_time(text) + + assert total_pause == 1000 + + def test_pause_summary(self): + handler = PauseHandler() + text = "Hello... world—test... again" + summary = handler.get_pause_summary(text) + + assert summary['total_chunks'] == 4 + assert summary['chunks_with_pauses'] == 3 + assert summary['pause_types']['...'] == 2 + assert summary['pause_types']['—'] == 1 + + def test_convenience_function(self): + chunks = split_text_with_pauses("Hello... 
world") + + assert len(chunks) == 2 + assert isinstance(chunks[0], TextChunk) + assert chunks[0].pause_after_ms == 600 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/unit_tests/test_long_text_limits.py b/unit_tests/test_long_text_limits.py new file mode 100644 index 0000000..001da5a --- /dev/null +++ b/unit_tests/test_long_text_limits.py @@ -0,0 +1,65 @@ +"""Unit tests for configurable long text limits""" + +from pathlib import Path +from typing import Iterator +import sys + +import pytest +from pydantic import ValidationError + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from app.config import Config +from app.core.text_processing import validate_long_text_input +from app.models.long_text import LongTextRequest + + +@pytest.fixture() +def reset_long_text_limits() -> Iterator[None]: + """Restore Config long text limits after each test""" + original_min = Config.LONG_TEXT_MIN_LENGTH + original_max = Config.LONG_TEXT_MAX_LENGTH + yield + Config.LONG_TEXT_MIN_LENGTH = original_min + Config.LONG_TEXT_MAX_LENGTH = original_max + + +def test_long_text_request_accepts_env_configured_min(monkeypatch: pytest.MonkeyPatch, reset_long_text_limits: None) -> None: + """Long text requests should accept inputs that meet the configured minimum""" + monkeypatch.setenv("LONG_TEXT_MIN_LENGTH", "100") + monkeypatch.setenv("LONG_TEXT_MAX_LENGTH", "1000") + + sample_text = "x" * 150 + request = LongTextRequest(input=sample_text) + + assert request.input == sample_text + + +def test_long_text_request_rejects_below_min(monkeypatch: pytest.MonkeyPatch, reset_long_text_limits: None) -> None: + """Validation should fail when text length is below the configured minimum""" + monkeypatch.setenv("LONG_TEXT_MIN_LENGTH", "200") + monkeypatch.setenv("LONG_TEXT_MAX_LENGTH", "1000") + + with pytest.raises(ValidationError) as exc_info: + LongTextRequest(input="y" * 150) + + assert "200" in str(exc_info.value) + + +def test_validate_long_text_input_uses_configured_limits(monkeypatch: pytest.MonkeyPatch, reset_long_text_limits: None) -> None: + """Core validation should leverage runtime configuration for min and max lengths""" + monkeypatch.setenv("LONG_TEXT_MIN_LENGTH", "120") + monkeypatch.setenv("LONG_TEXT_MAX_LENGTH", "400") + + is_valid, _ = validate_long_text_input("z" * 200) + assert is_valid + + too_short_valid, too_short_message = validate_long_text_input("z" * 100) + assert not too_short_valid + assert "120" in too_short_message + + too_long_valid, too_long_message = validate_long_text_input("z" * 450) + assert not too_long_valid + assert "400" in too_long_message