Overhauled speech speed, removing the need for ffmpeg

travisvn · travisvn · commit b8c3ee2bb8aa · 2024-11-13T23:20:36.000-05:00
diff --git a/.env.example b/.env.example
@@ -3,7 +3,7 @@ PORT=5050
 
 DEFAULT_VOICE=en-US-AndrewNeural
 DEFAULT_RESPONSE_FORMAT=mp3
-DEFAULT_SPEED=1.0
+DEFAULT_SPEED=1.2
 
 DEFAULT_LANGUAGE=en-US
 
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ This project provides a local, OpenAI-compatible text-to-speech (TTS) API using
 
 - **Docker** (recommended): Docker and Docker Compose for containerized setup.
 - **Python** (optional): For local development, install dependencies in `requirements.txt`.
-- **ffmpeg**: Required for audio format conversion and playback speed adjustments.
+- **ffmpeg** (optional): Required only for converting audio to formats other than mp3. Playback speed is now applied by `edge-tts` directly, so ffmpeg is not needed if sticking to mp3.
 
 ### Installation
 
@@ -48,7 +48,7 @@ PORT=5050
 
 DEFAULT_VOICE=en-US-AndrewNeural
 DEFAULT_RESPONSE_FORMAT=mp3
-DEFAULT_SPEED=1.0
+DEFAULT_SPEED=1.2
 
 DEFAULT_LANGUAGE=en-US
 
@@ -130,7 +130,7 @@ PORT=5050
 
 DEFAULT_VOICE=en-US-AndrewNeural
 DEFAULT_RESPONSE_FORMAT=mp3
-DEFAULT_SPEED=1.0
+DEFAULT_SPEED=1.2
 
 DEFAULT_LANGUAGE=en-US
 
@@ -167,7 +167,7 @@ Generates audio from the input text. Available parameters:
 - **model** (string): Set to "tts-1" or "tts-1-hd" (default: `"tts-1"`).
 - **voice** (string): One of the OpenAI-compatible voices (alloy, echo, fable, onyx, nova, shimmer) or any valid `edge-tts` voice (default: `"en-US-AndrewNeural"`).
 - **response_format** (string): Audio format. Options: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` (default: `mp3`).
-- **speed** (number): Playback speed (0.25 to 4.0). Default is `1.0`.
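+- **speed** (number): Playback speed (0.25 to 4.0). Default is `1.2`. Values above 2.0 are currently not supported by the speed conversion and fall back to normal speed.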
 
 Example request with `curl` and saving the output to an mp3 file:
 
@@ -179,7 +179,7 @@ curl -X POST http://localhost:5050/v1/audio/speech \
     "input": "Hello, I am your AI assistant! Just let me know how I can help bring your ideas to life.",
     "voice": "echo",
     "response_format": "mp3",
-    "speed": 1.0
+    "speed": 1.2
   }' \
   --output speech.mp3
 ```
diff --git a/app/server.py b/app/server.py
@@ -16,7 +16,7 @@
 
 DEFAULT_VOICE = os.getenv('DEFAULT_VOICE', 'en-US-AndrewNeural')
 DEFAULT_RESPONSE_FORMAT = os.getenv('DEFAULT_RESPONSE_FORMAT', 'mp3')
-DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.0))
+DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.2))
 
 # DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')
 
diff --git a/app/tts_handler.py b/app/tts_handler.py
@@ -3,6 +3,7 @@
 import tempfile
 import subprocess
 import os
+from pathlib import Path
 
 # Language default (environment variable)
 DEFAULT_LANGUAGE = os.getenv('DEFAULT_LANGUAGE', 'en-US')
@@ -26,46 +27,71 @@ def is_ffmpeg_installed():
         return False
 
 async def _generate_audio(text, voice, response_format, speed):
+    """Generate TTS audio and optionally convert to a different format."""
     # Determine if the voice is an OpenAI-compatible voice or a direct edge-tts voice
     edge_tts_voice = voice_mapping.get(voice, voice)  # Use mapping if in OpenAI names, otherwise use as-is
 
     # Generate the TTS output in mp3 format first
     temp_output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-    communicator = edge_tts.Communicate(text, edge_tts_voice)
+
+    # Convert speed to SSML rate format
+    try:
+        speed_rate = speed_to_rate(speed)  # Convert speed value to "+X%" or "-X%"
+    except Exception as e:
+        print(f"Error converting speed: {e}. Defaulting to +0%.")
+        speed_rate = "+0%"
+
+    # Generate the MP3 file
+    communicator = edge_tts.Communicate(text=text, voice=edge_tts_voice, rate=speed_rate)
     await communicator.save(temp_output_file.name)
 
-    # If the requested format is mp3 and speed is 1.0, return the generated file directly
-    if response_format == "mp3" and speed == 1.0:
+    # If the requested format is mp3, return the generated file directly
+    if response_format == "mp3":
         return temp_output_file.name
 
     # Check if FFmpeg is installed
-    ffmpeg_available = is_ffmpeg_installed()
-
-    # If FFmpeg is not available, return the generated mp3 file without conversion
-    if not ffmpeg_available:
-        print("FFmpeg not available. Returning unmodified mp3 file.")
+    if not is_ffmpeg_installed():
+        print("FFmpeg is not available. Returning unmodified mp3 file.")
         return temp_output_file.name
 
-    # Convert to the requested format if FFmpeg is available
+    # Create a new temporary file for the converted output
     converted_output_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{response_format}")
 
-    # ffmpeg playback speed adjustment
-    speed_filter = f"atempo={speed}" if response_format != "pcm" else f"asetrate=44100*{speed},aresample=44100"
+    # Build the FFmpeg command
     ffmpeg_command = [
-        "ffmpeg", "-i", temp_output_file.name,
-        "-filter:a", speed_filter,  # Apply speed adjustment
-        "-f", response_format, "-y",
-        converted_output_file.name
+        "ffmpeg",
+        "-i", temp_output_file.name,  # Input file
+        "-c:a", {
+            "aac": "aac",
+            "mp3": "libmp3lame",
+            "wav": "pcm_s16le",
+            "opus": "libopus",
+            "flac": "flac"
+        }.get(response_format, "aac"),  # Default to AAC if unknown
+        "-b:a", "192k" if response_format != "wav" else None,  # Bitrate not needed for WAV
+        "-f", {
+            "aac": "mp4",  # AAC in MP4 container
+            "mp3": "mp3",
+            "wav": "wav",
+            "opus": "ogg",
+            "flac": "flac"
+        }.get(response_format, response_format),  # Default to matching format
+        "-y",  # Overwrite without prompt
+        converted_output_file.name  # Output file
     ]
 
     try:
-        subprocess.run(ffmpeg_command, check=True)
+        # Run FFmpeg command and ensure no errors occur
+        subprocess.run(ffmpeg_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Error in audio conversion: {e}")
+        raise RuntimeError(f"FFmpeg error during audio conversion: {e}")
+
+    # Clean up the original temporary file
+    Path(temp_output_file.name).unlink(missing_ok=True)
 
     return converted_output_file.name
 
-def generate_speech(text, voice, response_format, speed=1.0):
+def generate_speech(text, voice, response_format, speed=1.2):
     return asyncio.run(_generate_audio(text, voice, response_format, speed))
 
 def get_models():
@@ -86,3 +112,22 @@ async def _get_voices(language=None):
 
 def get_voices(language=None):
     return asyncio.run(_get_voices(language))
+
+def speed_to_rate(speed: float) -> str:
+    """
+    Converts a multiplicative speed value to the edge-tts "rate" format.
+    
+    Args:
+        speed (float): The multiplicative speed value (e.g., 1.5 for +50%, 0.5 for -50%).
+    
+    Returns:
+        str: The formatted "rate" string (e.g., "+50%" or "-50%").
+    """
+    if speed < 0 or speed > 2:
+        raise ValueError("Speed must be between 0 and 2 (inclusive).")
+
+    # Convert speed to percentage change
+    percentage_change = (speed - 1) * 100
+
+    # Format with a leading "+" or "-" as required
+    return f"{percentage_change:+.0f}%"
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -10,6 +10,6 @@ services:
       PORT: ${PORT:-5050}
       DEFAULT_VOICE: ${DEFAULT_VOICE:-en-US-AndrewNeural}
       DEFAULT_RESPONSE_FORMAT: ${DEFAULT_RESPONSE_FORMAT:-mp3}
-      DEFAULT_SPEED: ${DEFAULT_SPEED:-1.0}
+      DEFAULT_SPEED: ${DEFAULT_SPEED:-1.2}
       DEFAULT_LANGUAGE: ${DEFAULT_LANGUAGE:-en-US}
       REQUIRE_API_KEY: ${REQUIRE_API_KEY:-True}