Skip to content

Commit b8c3ee2

Browse files
committed
Overhauled speech speed, removing the need for ffmpeg
1 parent 1dd283b commit b8c3ee2

File tree

5 files changed

+71
-26
lines changed

5 files changed

+71
-26
lines changed

.env.example

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ PORT=5050
33

44
DEFAULT_VOICE=en-US-AndrewNeural
55
DEFAULT_RESPONSE_FORMAT=mp3
6-
DEFAULT_SPEED=1.0
6+
DEFAULT_SPEED=1.2
77

88
DEFAULT_LANGUAGE=en-US
99

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ This project provides a local, OpenAI-compatible text-to-speech (TTS) API using
3131

3232
- **Docker** (recommended): Docker and Docker Compose for containerized setup.
3333
- **Python** (optional): For local development, install dependencies in `requirements.txt`.
34-
- **ffmpeg**: Required for audio format conversion and playback speed adjustments.
34+
- **ffmpeg** (optional): Only needed to convert audio to formats other than mp3. Not required if you stick to mp3; playback speed adjustment no longer depends on it.
3535

3636
### Installation
3737

@@ -48,7 +48,7 @@ PORT=5050
4848
4949
DEFAULT_VOICE=en-US-AndrewNeural
5050
DEFAULT_RESPONSE_FORMAT=mp3
51-
DEFAULT_SPEED=1.0
51+
DEFAULT_SPEED=1.2
5252
5353
DEFAULT_LANGUAGE=en-US
5454
@@ -130,7 +130,7 @@ PORT=5050
130130
131131
DEFAULT_VOICE=en-US-AndrewNeural
132132
DEFAULT_RESPONSE_FORMAT=mp3
133-
DEFAULT_SPEED=1.0
133+
DEFAULT_SPEED=1.2
134134
135135
DEFAULT_LANGUAGE=en-US
136136
@@ -167,7 +167,7 @@ Generates audio from the input text. Available parameters:
167167
- **model** (string): Set to "tts-1" or "tts-1-hd" (default: `"tts-1"`).
168168
- **voice** (string): One of the OpenAI-compatible voices (alloy, echo, fable, onyx, nova, shimmer) or any valid `edge-tts` voice (default: `"en-US-AndrewNeural"`).
169169
- **response_format** (string): Audio format. Options: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` (default: `mp3`).
170-
- **speed** (number): Playback speed (0.25 to 4.0). Default is `1.0`.
170+
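- **speed** (number): Playback speed multiplier. The TTS handler applies values from 0 to 2.0; values outside that range fall back to normal speed (1.0). Default is `1.2`.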
171171

172172
Example request with `curl` and saving the output to an mp3 file:
173173

@@ -179,7 +179,7 @@ curl -X POST http://localhost:5050/v1/audio/speech \
179179
"input": "Hello, I am your AI assistant! Just let me know how I can help bring your ideas to life.",
180180
"voice": "echo",
181181
"response_format": "mp3",
182-
"speed": 1.0
182+
"speed": 1.2
183183
}' \
184184
--output speech.mp3
185185
```

app/server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
DEFAULT_VOICE = os.getenv('DEFAULT_VOICE', 'en-US-AndrewNeural')
1818
DEFAULT_RESPONSE_FORMAT = os.getenv('DEFAULT_RESPONSE_FORMAT', 'mp3')
19-
DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.0))
19+
DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.2))
2020

2121
# DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')
2222

app/tts_handler.py

Lines changed: 63 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import tempfile
44
import subprocess
55
import os
6+
from pathlib import Path
67

78
# Language default (environment variable)
89
DEFAULT_LANGUAGE = os.getenv('DEFAULT_LANGUAGE', 'en-US')
@@ -26,46 +27,71 @@ def is_ffmpeg_installed():
2627
return False
2728

2829
async def _generate_audio(text, voice, response_format, speed):
30+
"""Generate TTS audio and optionally convert to a different format."""
2931
# Determine if the voice is an OpenAI-compatible voice or a direct edge-tts voice
3032
edge_tts_voice = voice_mapping.get(voice, voice) # Use mapping if in OpenAI names, otherwise use as-is
3133

3234
# Generate the TTS output in mp3 format first
3335
temp_output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
34-
communicator = edge_tts.Communicate(text, edge_tts_voice)
36+
37+
# Convert speed to SSML rate format
38+
try:
39+
speed_rate = speed_to_rate(speed) # Convert speed value to "+X%" or "-X%"
40+
except Exception as e:
41+
print(f"Error converting speed: {e}. Defaulting to +0%.")
42+
speed_rate = "+0%"
43+
44+
# Generate the MP3 file
45+
communicator = edge_tts.Communicate(text=text, voice=edge_tts_voice, rate=speed_rate)
3546
await communicator.save(temp_output_file.name)
3647

37-
# If the requested format is mp3 and speed is 1.0, return the generated file directly
38-
if response_format == "mp3" and speed == 1.0:
48+
# If the requested format is mp3, return the generated file directly
49+
if response_format == "mp3":
3950
return temp_output_file.name
4051

4152
# Check if FFmpeg is installed
42-
ffmpeg_available = is_ffmpeg_installed()
43-
44-
# If FFmpeg is not available, return the generated mp3 file without conversion
45-
if not ffmpeg_available:
46-
print("FFmpeg not available. Returning unmodified mp3 file.")
53+
if not is_ffmpeg_installed():
54+
print("FFmpeg is not available. Returning unmodified mp3 file.")
4755
return temp_output_file.name
4856

49-
# Convert to the requested format if FFmpeg is available
57+
# Create a new temporary file for the converted output
5058
converted_output_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{response_format}")
5159

52-
# ffmpeg playback speed adjustment
53-
speed_filter = f"atempo={speed}" if response_format != "pcm" else f"asetrate=44100*{speed},aresample=44100"
60+
# Build the FFmpeg command
5461
ffmpeg_command = [
55-
"ffmpeg", "-i", temp_output_file.name,
56-
"-filter:a", speed_filter, # Apply speed adjustment
57-
"-f", response_format, "-y",
58-
converted_output_file.name
62+
"ffmpeg",
63+
"-i", temp_output_file.name, # Input file
64+
"-c:a", {
65+
"aac": "aac",
66+
"mp3": "libmp3lame",
67+
"wav": "pcm_s16le",
68+
"opus": "libopus",
69+
"flac": "flac"
70+
}.get(response_format, "aac"), # Default to AAC if unknown
71+
"-b:a", "192k" if response_format != "wav" else None, # Bitrate not needed for WAV
72+
"-f", {
73+
"aac": "mp4", # AAC in MP4 container
74+
"mp3": "mp3",
75+
"wav": "wav",
76+
"opus": "ogg",
77+
"flac": "flac"
78+
}.get(response_format, response_format), # Default to matching format
79+
"-y", # Overwrite without prompt
80+
converted_output_file.name # Output file
5981
]
6082

6183
try:
62-
subprocess.run(ffmpeg_command, check=True)
84+
# Run FFmpeg command and ensure no errors occur
85+
subprocess.run(ffmpeg_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
6386
except subprocess.CalledProcessError as e:
64-
raise RuntimeError(f"Error in audio conversion: {e}")
87+
raise RuntimeError(f"FFmpeg error during audio conversion: {e}")
88+
89+
# Clean up the original temporary file
90+
Path(temp_output_file.name).unlink(missing_ok=True)
6591

6692
return converted_output_file.name
6793

68-
def generate_speech(text, voice, response_format, speed=1.0):
94+
def generate_speech(text, voice, response_format, speed=1.2):
6995
return asyncio.run(_generate_audio(text, voice, response_format, speed))
7096

7197
def get_models():
@@ -86,3 +112,22 @@ async def _get_voices(language=None):
86112

87113
def get_voices(language=None):
88114
return asyncio.run(_get_voices(language))
115+
116+
def speed_to_rate(speed: float) -> str:
117+
"""
118+
Converts a multiplicative speed value to the edge-tts "rate" format.
119+
120+
Args:
121+
speed (float): The multiplicative speed value (e.g., 1.5 for +50%, 0.5 for -50%).
122+
123+
Returns:
124+
str: The formatted "rate" string (e.g., "+50%" or "-50%").
125+
"""
126+
if speed < 0 or speed > 2:
127+
raise ValueError("Speed must be between 0 and 2 (inclusive).")
128+
129+
# Convert speed to percentage change
130+
percentage_change = (speed - 1) * 100
131+
132+
# Format with a leading "+" or "-" as required
133+
return f"{percentage_change:+.0f}%"

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,6 @@ services:
1010
PORT: ${PORT:-5050}
1111
DEFAULT_VOICE: ${DEFAULT_VOICE:-en-US-AndrewNeural}
1212
DEFAULT_RESPONSE_FORMAT: ${DEFAULT_RESPONSE_FORMAT:-mp3}
13-
DEFAULT_SPEED: ${DEFAULT_SPEED:-1.0}
13+
DEFAULT_SPEED: ${DEFAULT_SPEED:-1.2}
1414
DEFAULT_LANGUAGE: ${DEFAULT_LANGUAGE:-en-US}
1515
REQUIRE_API_KEY: ${REQUIRE_API_KEY:-True}

0 commit comments

Comments
 (0)