Added Markdown filtering (with an option to disable it), added beta support for ElevenLabs and Azure AI Speech endpoints (with an option to disable this expansion), and made the '/v1' prefix optional in API endpoint paths to make setup easier

travisvn · travisvn · commit 0f64d018c30a · 2024-12-27T21:47:58.000-05:00
diff --git a/.env.example b/.env.example
@@ -7,4 +7,8 @@ DEFAULT_SPEED=1.2
 
 DEFAULT_LANGUAGE=en-US
 
-REQUIRE_API_KEY=True
+REQUIRE_API_KEY=True
+
+REMOVE_FILTER=False
+
+EXPAND_API=True
diff --git a/app/handle_text.py b/app/handle_text.py
@@ -0,0 +1,62 @@
+import re
+import emoji
+
+def prepare_tts_input_with_context(text: str) -> str:
+    """
+    Prepares text for a TTS API by cleaning Markdown and adding minimal contextual hints
+    for certain Markdown elements like headers. Preserves paragraph separation.
+
+    Args:
+        text (str): The raw text containing Markdown or other formatting.
+
+    Returns:
+        str: Cleaned text with contextual hints suitable for TTS input.
+    """
+
+    # Remove emojis
+    text = emoji.replace_emoji(text, replace='')
+
+    # Replace fenced code blocks first: the inline-code rule below would otherwise eat the fences
+    text = re.sub(r"```([\s\S]+?)```", r"(code block omitted)", text)
+
+    # Add context for headers
+    def header_replacer(match):
+        level = len(match.group(1))  # Number of '#' symbols
+        header_text = match.group(2).strip()
+        if level == 1:
+            return f"Title — {header_text}\n"
+        elif level == 2:
+            return f"Section — {header_text}\n"
+        else:
+            return f"Subsection — {header_text}\n"
+
+    text = re.sub(r"^(#{1,6})\s+(.*)", header_replacer, text, flags=re.MULTILINE)
+
+    # Replace images with their alt text before links: the link rule also matches "[alt](url)"
+    text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", r"Image: \1", text)
+
+    # Announce links (currently commented out for potential future use)
+    # text = re.sub(r"\[([^\]]+)\]\((https?:\/\/[^\)]+)\)", r"\1 (link: \2)", text)
+
+    # Remove links while keeping the link text
+    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)
+
+    # Describe inline code
+    text = re.sub(r"`([^`]+)`", r"code snippet: \1", text)
+
+    # Remove bold/italic symbols but keep the content
+    text = re.sub(r"(\*\*|__|\*|_)", '', text)
+
+    # Remove HTML tags
+    text = re.sub(r"</?[^>]+(>|$)", '', text)
+
+    # Normalize line breaks
+    text = re.sub(r"\n{2,}", '\n\n', text)  # Ensure consistent paragraph separation
+
+    # Replace multiple spaces within lines
+    text = re.sub(r" {2,}", ' ', text)
+
+    # Trim leading and trailing whitespace from the whole text
+    text = text.strip()
+
+    return text
diff --git a/app/server.py b/app/server.py
@@ -5,8 +5,9 @@
 from dotenv import load_dotenv
 import os
 
+from handle_text import prepare_tts_input_with_context
 from tts_handler import generate_speech, get_models, get_voices
-from utils import require_api_key, AUDIO_FORMAT_MIME_TYPES
+from utils import getenv_bool, require_api_key, AUDIO_FORMAT_MIME_TYPES
 
 app = Flask(__name__)
 load_dotenv()
@@ -18,16 +19,24 @@
 DEFAULT_RESPONSE_FORMAT = os.getenv('DEFAULT_RESPONSE_FORMAT', 'mp3')
 DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.2))
 
+REMOVE_FILTER = getenv_bool('REMOVE_FILTER', False)  # When truthy, request text skips prepare_tts_input_with_context
+EXPAND_API = getenv_bool('EXPAND_API', True)  # When falsy, the ElevenLabs/Azure-style routes reject requests
+
 # DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')
 
 @app.route('/v1/audio/speech', methods=['POST'])
+@app.route('/audio/speech', methods=['POST'])  # Add this line for the alias
 @require_api_key
 def text_to_speech():
     data = request.json
     if not data or 'input' not in data:
         return jsonify({"error": "Missing 'input' in request body"}), 400
 
     text = data.get('input')
+
+    if not REMOVE_FILTER:
+        text = prepare_tts_input_with_context(text)
+
     # model = data.get('model', DEFAULT_MODEL)
     voice = data.get('voice', DEFAULT_VOICE)
 
@@ -43,11 +52,13 @@ def text_to_speech():
     return send_file(output_file_path, mimetype=mime_type, as_attachment=True, download_name=f"speech.{response_format}")
 
 @app.route('/v1/models', methods=['GET', 'POST'])
+@app.route('/models', methods=['GET', 'POST'])  # Alias without the '/v1' prefix
 @require_api_key
 def list_models():
     return jsonify({"data": get_models()})
 
 @app.route('/v1/voices', methods=['GET', 'POST'])
+@app.route('/voices', methods=['GET', 'POST'])
 @require_api_key
 def list_voices():
     specific_language = None
@@ -59,10 +70,91 @@ def list_voices():
     return jsonify({"voices": get_voices(specific_language)})
 
 @app.route('/v1/voices/all', methods=['GET', 'POST'])
+@app.route('/voices/all', methods=['GET', 'POST'])  # Alias without the '/v1' prefix
 @require_api_key
 def list_all_voices():
     return jsonify({"voices": get_voices('all')})
 
+"""
+Support for ElevenLabs and Azure AI Speech
+    (currently in beta)
+"""
+
+# Example request URL (the edge-tts voice name goes in the ElevenLabs voice_id path segment):
+# http://localhost:5050/elevenlabs/v1/text-to-speech/en-US-AndrewNeural
+@app.route('/elevenlabs/v1/text-to-speech/<voice_id>', methods=['POST'])
+@require_api_key
+def elevenlabs_tts(voice_id):
+    """ElevenLabs-style endpoint: synthesize the JSON 'text' field as MP3 using voice_id as the voice."""
+    if not EXPAND_API:
+        # Disabled by configuration: report a client-side 403, not a server fault (500)
+        return jsonify({"error": "Endpoint not allowed"}), 403
+
+    # Parse the incoming JSON payload
+    try:
+        payload = request.json
+    except Exception as e:
+        return jsonify({"error": f"Invalid JSON payload: {str(e)}"}), 400
+    # Reject non-object bodies and non-string text before they reach the text filter
+    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
+        return jsonify({"error": "Missing 'text' in request body"}), 400
+
+    text = payload['text']
+    if not REMOVE_FILTER:
+        text = prepare_tts_input_with_context(text)
+
+    # Use default settings for edge-tts: always MP3 at the server-wide default speed
+    response_format = 'mp3'
+    speed = DEFAULT_SPEED  # Optional customization via payload.get('speed', DEFAULT_SPEED)
+
+    # Generate speech using edge-tts; ElevenLabs passes the voice as voice_id in the URL
+    try:
+        output_file_path = generate_speech(text, voice_id, response_format, speed)
+    except Exception as e:
+        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500
+
+    # Return the generated audio file
+    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
+
+# tts.speech.microsoft.com/cognitiveservices/v1
+# https://{region}.tts.speech.microsoft.com/cognitiveservices/v1
+# http://localhost:5050/azure/cognitiveservices/v1
+@app.route('/azure/cognitiveservices/v1', methods=['POST'])
+@require_api_key
+def azure_tts():
+    """Azure-style endpoint: synthesize the text of the first SSML <voice> element as MP3."""
+    if not EXPAND_API:
+        # Disabled by configuration: report a client-side 403, not a server fault (500)
+        return jsonify({"error": "Endpoint not allowed"}), 403
+
+    # Parse the SSML payload
+    try:
+        ssml_data = request.data.decode('utf-8')
+        if not ssml_data:
+            return jsonify({"error": "Missing SSML payload"}), 400
+        # NOTE(review): stdlib ElementTree is not hardened against maliciously constructed XML
+        from xml.etree import ElementTree as ET
+        voice_element = ET.fromstring(ssml_data).find('.//{http://www.w3.org/2001/10/synthesis}voice')
+        if voice_element is None:
+            return jsonify({"error": "Invalid SSML payload: missing <voice> element"}), 400
+        # itertext() also collects text nested in child tags (e.g. <prosody>), which .text misses
+        text = ''.join(voice_element.itertext())
+        voice = voice_element.get('name')
+    except Exception as e:
+        return jsonify({"error": f"Invalid SSML payload: {str(e)}"}), 400
+
+    if not REMOVE_FILTER:
+        text = prepare_tts_input_with_context(text)
+
+    # Generate speech using edge-tts, always as MP3 at the server-wide default speed
+    try:
+        output_file_path = generate_speech(text, voice, 'mp3', DEFAULT_SPEED)
+    except Exception as e:
+        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500
+
+    # Return the generated audio file
+    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
+
 print(f" Edge TTS (Free Azure TTS) Replacement for OpenAI's TTS API")
 print(f" ")
 print(f" * Serving OpenAI Edge TTS")
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -13,3 +13,5 @@ services:
       DEFAULT_SPEED: ${DEFAULT_SPEED:-1.2}
       DEFAULT_LANGUAGE: ${DEFAULT_LANGUAGE:-en-US}
       REQUIRE_API_KEY: ${REQUIRE_API_KEY:-True}
+      REMOVE_FILTER: ${REMOVE_FILTER:-False}
+      EXPAND_API: ${EXPAND_API:-True}
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 flask
 gevent
 python-dotenv
-edge-tts
+edge-tts
+emoji