Skip to content

Commit 0f64d01

Browse files
committed
Added filtering for markdown with option to disable, added support for ElevenLabs and Azure AI Speech endpoints (currently in beta) (option to disable this expansion), and added option to not include '/v1' in the API endpoint to make setup easier
1 parent 09eb068 commit 0f64d01

File tree

5 files changed

+164
-3
lines changed

5 files changed

+164
-3
lines changed

.env.example

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,8 @@ DEFAULT_SPEED=1.2
77

88
DEFAULT_LANGUAGE=en-US
99

10-
REQUIRE_API_KEY=True
10+
REQUIRE_API_KEY=True
11+
12+
REMOVE_FILTER=False
13+
14+
EXPAND_API=True

app/handle_text.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import re
2+
import emoji
3+
4+
def prepare_tts_input_with_context(text: str) -> str:
    """
    Prepare text for a TTS API by cleaning Markdown and adding minimal contextual
    hints for certain Markdown elements such as headers. Paragraph separation is
    preserved.

    The substitutions are order-sensitive:

    * Fenced code blocks are replaced first, so their contents are never
      reinterpreted as headers, links or inline code.
    * Images are handled before links, because image syntax contains link syntax.
    * Only *paired* emphasis markers are stripped, so ``snake_case`` identifiers
      and arithmetic such as ``2 * 3`` survive.

    Args:
        text (str): The raw text containing Markdown or other formatting.

    Returns:
        str: Cleaned text with contextual hints suitable for TTS input.
    """

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Replace fenced code blocks with a short description. This must run before
    # the inline-code rule, which would otherwise eat the fence backticks.
    text = re.sub(r"```[\s\S]+?```", "(code block omitted)", text)

    # Add context for headers
    def header_replacer(match):
        level = len(match.group(1))  # Number of '#' symbols
        header_text = match.group(2).strip()
        if level == 1:
            return f"Title — {header_text}\n"
        elif level == 2:
            return f"Section — {header_text}\n"
        else:
            return f"Subsection — {header_text}\n"

    # Only spaces/tabs may separate the marker from the title; `\s+` could span a
    # newline and promote the following line to a header.
    text = re.sub(r"^(#{1,6})[ \t]+(.*)", header_replacer, text, flags=re.MULTILINE)

    # Replace image syntax with its alt text (before links: "![alt](url)" would
    # otherwise be consumed by the link rule and leave a stray "!").
    text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", r"Image: \1", text)

    # Remove links while keeping the link text (URLs are not read aloud)
    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)

    # Describe inline code
    text = re.sub(r"`([^`]+)`", r"code snippet: \1", text)

    # Drop thematic breaks made of '*' or '_' (e.g. "***", "* * *", "___")
    text = re.sub(r"^[ \t]*([*_])(?:[ \t]*\1){2,}[ \t]*$", '', text, flags=re.MULTILINE)

    # Drop '*' list bullets so the marker itself is not spoken
    text = re.sub(r"^([ \t]*)\*[ \t]+", r"\1", text, flags=re.MULTILINE)

    # Remove bold/italic markers but keep the content. Strong emphasis goes first
    # so that "***text***" collapses cleanly; underscore forms must sit on word
    # boundaries so identifiers such as snake_case are left intact.
    text = re.sub(r"\*\*(?=\S)(.+?)(?<=\S)\*\*", r"\1", text)
    text = re.sub(r"(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)", r"\1", text)
    text = re.sub(r"\*(?=\S)(.+?)(?<=\S)\*", r"\1", text)
    text = re.sub(r"(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)", r"\1", text)

    # Remove HTML comments and well-formed tags. A lone '<' (as in "a < b") is
    # kept; the previous pattern deleted everything from it to the end of text.
    text = re.sub(r"<!--[\s\S]*?-->", '', text)
    text = re.sub(r"<[/!]?[A-Za-z][^>]*>", '', text)

    # Normalize line breaks
    text = re.sub(r"\n{2,}", '\n\n', text)  # Ensure consistent paragraph separation

    # Replace multiple spaces within lines
    text = re.sub(r" {2,}", ' ', text)

    # Trim leading and trailing whitespace from the whole text
    text = text.strip()

    return text

app/server.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
from dotenv import load_dotenv
66
import os
77

8+
from handle_text import prepare_tts_input_with_context
89
from tts_handler import generate_speech, get_models, get_voices
9-
from utils import require_api_key, AUDIO_FORMAT_MIME_TYPES
10+
from utils import getenv_bool, require_api_key, AUDIO_FORMAT_MIME_TYPES
1011

1112
app = Flask(__name__)
1213
load_dotenv()
@@ -18,16 +19,24 @@
1819
DEFAULT_RESPONSE_FORMAT = os.getenv('DEFAULT_RESPONSE_FORMAT', 'mp3')
1920
DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.2))
2021

22+
REMOVE_FILTER = getenv_bool('REMOVE_FILTER', False)
23+
EXPAND_API = getenv_bool('EXPAND_API', True)
24+
2125
# DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')
2226

2327
@app.route('/v1/audio/speech', methods=['POST'])
28+
@app.route('/audio/speech', methods=['POST']) # Add this line for the alias
2429
@require_api_key
2530
def text_to_speech():
2631
data = request.json
2732
if not data or 'input' not in data:
2833
return jsonify({"error": "Missing 'input' in request body"}), 400
2934

3035
text = data.get('input')
36+
37+
if not REMOVE_FILTER:
38+
text = prepare_tts_input_with_context(text)
39+
3140
# model = data.get('model', DEFAULT_MODEL)
3241
voice = data.get('voice', DEFAULT_VOICE)
3342

@@ -43,11 +52,13 @@ def text_to_speech():
4352
return send_file(output_file_path, mimetype=mime_type, as_attachment=True, download_name=f"speech.{response_format}")
4453

4554
@app.route('/v1/models', methods=['GET', 'POST'])
@app.route('/models', methods=['GET', 'POST'])
@require_api_key
def list_models():
    """Return the result of ``get_models()`` as JSON under the ``data`` key."""
    models = get_models()
    return jsonify({"data": models})
4959

5060
@app.route('/v1/voices', methods=['GET', 'POST'])
61+
@app.route('/voices', methods=['GET', 'POST'])
5162
@require_api_key
5263
def list_voices():
5364
specific_language = None
@@ -59,10 +70,91 @@ def list_voices():
5970
return jsonify({"voices": get_voices(specific_language)})
6071

6172
@app.route('/v1/voices/all', methods=['GET', 'POST'])
@app.route('/voices/all', methods=['GET', 'POST'])
@require_api_key
def list_all_voices():
    """Return ``get_voices('all')`` as JSON under the ``voices`` key."""
    every_voice = get_voices('all')
    return jsonify({"voices": every_voice})
6577

78+
"""
79+
Support for ElevenLabs and Azure AI Speech
80+
(currently in beta)
81+
"""
82+
83+
# Base URL (a voice ID must be appended): http://localhost:5050/elevenlabs/v1/text-to-speech
84+
# Example request URL: http://localhost:5050/elevenlabs/v1/text-to-speech/en-US-AndrewNeural
85+
@app.route('/elevenlabs/v1/text-to-speech/<voice_id>', methods=['POST'])
@require_api_key
def elevenlabs_tts(voice_id):
    """
    ElevenLabs-style text-to-speech endpoint.

    Expects a JSON object with a string ``text`` field; the voice is taken from
    the ``voice_id`` URL segment. The audio is always produced as MP3 at
    ``DEFAULT_SPEED``.

    Returns:
        The generated MP3 as an attachment, or a JSON error with status
        403 (endpoint disabled via EXPAND_API), 400 (bad request body) or
        500 (speech generation failed).
    """
    if not EXPAND_API:
        # Disabled by configuration: this is a refusal, not a server fault,
        # so report 403 rather than 500.
        return jsonify({"error": "Endpoint not allowed"}), 403

    # Parse the incoming JSON payload. silent=True yields None for a missing or
    # malformed body instead of raising, so no blanket `except` is needed.
    payload = request.get_json(silent=True)
    if payload is None:
        return jsonify({"error": "Invalid JSON payload: expected a JSON object"}), 400
    # A JSON string or list would pass a bare `'text' in payload` check and then
    # fail on subscripting, so require an object explicitly.
    if not isinstance(payload, dict) or 'text' not in payload:
        return jsonify({"error": "Missing 'text' in request body"}), 400

    text = payload['text']
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "'text' must be a non-empty string"}), 400

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    voice = voice_id  # ElevenLabs uses the voice_id in the URL

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED  # Optional customization via payload.get('speed', DEFAULT_SPEED)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
118+
119+
# tts.speech.microsoft.com/cognitiveservices/v1
120+
# https://{region}.tts.speech.microsoft.com/cognitiveservices/v1
121+
# Local equivalent served by this app: http://localhost:5050/azure/cognitiveservices/v1
122+
@app.route('/azure/cognitiveservices/v1', methods=['POST'])
@require_api_key
def azure_tts():
    """
    Azure AI Speech-style text-to-speech endpoint.

    Expects an SSML document in the request body. The voice comes from the
    ``name`` attribute of the first ``<voice>`` element and the text is all
    character data inside that element, including text nested in child
    elements such as ``<prosody>``. The audio is always produced as MP3 at
    ``DEFAULT_SPEED``.

    Returns:
        The generated MP3 as an attachment, or a JSON error with status
        403 (endpoint disabled via EXPAND_API), 400 (bad SSML) or
        500 (speech generation failed).
    """
    if not EXPAND_API:
        # Disabled by configuration: this is a refusal, not a server fault,
        # so report 403 rather than 500.
        return jsonify({"error": "Endpoint not allowed"}), 403

    # Kept as a function-scope import, as in the original code.
    # NOTE(review): xml.etree is not hardened against maliciously constructed
    # XML (see the "XML vulnerabilities" section of the Python docs); consider
    # a hardened parser if this endpoint is reachable by untrusted clients.
    from xml.etree import ElementTree as ET

    # Parse the SSML payload
    try:
        ssml_data = request.data.decode('utf-8')
        if not ssml_data:
            return jsonify({"error": "Missing SSML payload"}), 400
        root = ET.fromstring(ssml_data)
    except (ValueError, ET.ParseError) as e:
        # ValueError covers UnicodeDecodeError from the decode step.
        return jsonify({"error": f"Invalid SSML payload: {str(e)}"}), 400

    # Extract the text and voice from SSML. Look the element up once; fall back
    # to an un-namespaced <voice> for clients that omit the SSML namespace.
    # `is None` is required: an Element without children is falsy.
    voice_element = root.find('.//{http://www.w3.org/2001/10/synthesis}voice')
    if voice_element is None:
        voice_element = root.find('.//voice')
    if voice_element is None:
        return jsonify({"error": "Invalid SSML payload: missing <voice> element"}), 400

    voice = voice_element.get('name')
    if not voice:
        return jsonify({"error": "Invalid SSML payload: <voice> has no 'name' attribute"}), 400

    # `.text` only returns the text before the first child element, so gather
    # every text node to support content wrapped in <prosody>, <emphasis>, etc.
    text = ''.join(voice_element.itertext()).strip()
    if not text:
        return jsonify({"error": "Invalid SSML payload: <voice> contains no text"}), 400

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
157+
66158
print(f" Edge TTS (Free Azure TTS) Replacement for OpenAI's TTS API")
67159
print(f" ")
68160
print(f" * Serving OpenAI Edge TTS")

docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@ services:
1313
DEFAULT_SPEED: ${DEFAULT_SPEED:-1.2}
1414
DEFAULT_LANGUAGE: ${DEFAULT_LANGUAGE:-en-US}
1515
REQUIRE_API_KEY: ${REQUIRE_API_KEY:-True}
16+
REMOVE_FILTER: ${REMOVE_FILTER:-False}
17+
EXPAND_API: ${EXPAND_API:-True}

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
flask
22
gevent
33
python-dotenv
4-
edge-tts
4+
edge-tts
5+
emoji

0 commit comments

Comments
 (0)