spell_jam: refactor for switchable TTS backend

RetiredWizard · RetiredWizard · commit 81cbdc28f2f4 · 2025-10-09T18:14:35.000-04:00
diff --git a/Fruit_Jam/Fruit_Jam_Spell_Jam/aws_polly.py b/Fruit_Jam/Fruit_Jam_Spell_Jam/aws_polly.py
@@ -199,7 +199,7 @@ def text_to_speech_polly_http(
     text,
     access_key,
     secret_key,
-    output_file="/saves/awspollyoutput.mp3",
+    output_file="/saves/tts_output.mp3",
     voice_id="Joanna",
     region="us-east-1",
     output_format="mp3",
diff --git a/Fruit_Jam/Fruit_Jam_Spell_Jam/code.py b/Fruit_Jam/Fruit_Jam_Spell_Jam/code.py
@@ -3,8 +3,8 @@
 import os
 import sys
 import time
-
 import supervisor
+
 from adafruit_fruitjam import FruitJam
 from adafruit_fruitjam.peripherals import request_display_config
 import adafruit_connection_manager
@@ -13,10 +13,16 @@
 from adafruit_bitmap_font import bitmap_font
 from adafruit_display_text.bitmap_label import Label
 
-from aws_polly import text_to_speech_polly_http
-
 from launcher_config import LauncherConfig
 
+# If tts_local.py exists, use that instead of tts_aws.py
+try:
+    # tts_local defines WordFetcherTTS for TTS engine running on local network server
+    from tts_local import WordFetcherTTS
+except ImportError:
+    from tts_aws import WordFetcherTTS
+
+# read the user settings
 launcher_config = LauncherConfig()
 
 # constants
@@ -60,55 +66,26 @@
 
 fj.neopixels.brightness = 0.1
 
-# AWS auth requires us to have accurate date/time
-now = fj.sync_time()
-
-# setup adafruit_requests session
-# pylint: disable=protected-access
-pool = adafruit_connection_manager.get_radio_socketpool(fj.network._wifi.esp)
-ssl_context = adafruit_connection_manager.get_radio_ssl_context(fj.network._wifi.esp)
-requests = adafruit_requests.Session(pool, ssl_context)
-
-# read AWS keys from settings.toml
-AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
-AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
-
-
-def fetch_word(word, voice="Joanna"):
-    """
-    Fetch an MP3 saying a word from AWS Polly
-    :param word: The word to speak
-    :param voice: The AWS Polly voide ID to use
-    :return: Boolean, whether the request was successful.
-    """
-
-    if AWS_ACCESS_KEY is None or AWS_SECRET_KEY is None:
-        return False
-
-    fj.neopixels.fill(0xFFFF00)
-    success = text_to_speech_polly_http(
-        requests,
-        text=word,
-        access_key=AWS_ACCESS_KEY,
-        secret_key=AWS_SECRET_KEY,
-        voice_id=voice,
-    )
-    fj.neopixels.fill(0x00FF00)
-    return success
-
+word_fetcher = WordFetcherTTS(fj, launcher_config)
 
 def say_and_spell_lastword():
     """
     Say the last word, then spell it out one letter at a time, finally say it once more.
     """
     if sayword:
-        fj.play_mp3_file("/saves/awspollyoutput.mp3")
+        if word_fetcher.output_path[-4:] == ".mp3":
+            fj.play_mp3_file(word_fetcher.output_path)
+        elif word_fetcher.output_path[-4:] == ".wav":
+            fj.play_file(word_fetcher.output_path)
         time.sleep(0.2)
     for letter in lastword:
         fj.play_mp3_file(f"spell_jam_assets/letter_mp3s/{letter.upper()}.mp3")
     time.sleep(0.2)
     if sayword:
-        fj.play_mp3_file("/saves/awspollyoutput.mp3")
+        if word_fetcher.output_path[-4:] == ".mp3":
+            fj.play_mp3_file(word_fetcher.output_path)
+        elif word_fetcher.output_path[-4:] == ".wav":
+            fj.play_file(word_fetcher.output_path)
     fj.neopixels.fill(0x000000)
 
 
@@ -133,7 +110,7 @@ def say_and_spell_lastword():
         elif c == "\n":
             if curword:
                 lastword = curword
-                sayword = fetch_word(lastword)
+                sayword = word_fetcher.fetch_word(lastword)
                 say_and_spell_lastword()
                 curword = ""
             else:
diff --git a/Fruit_Jam/Fruit_Jam_Spell_Jam/tts_aws.py b/Fruit_Jam/Fruit_Jam_Spell_Jam/tts_aws.py
@@ -0,0 +1,73 @@
+# tts_aws.py
+"""AWS Polly text-to-speech backend for Spell Jam."""
+import os
+import adafruit_connection_manager
+import adafruit_requests
+from aws_polly import text_to_speech_polly_http
+
+
+class WordFetcherTTS():
+    """Fetch a spoken word from AWS Polly and save it as an MP3 file."""
+
+    def __init__(self, fj=None, launcher_config=None, output_path="/saves/tts_output.mp3"):
+        """
+        Sync the clock, create a requests session with an SSL context and read the AWS keys.
+
+        :param fj: The FruitJam instance; required, it supplies the network radio
+        :param launcher_config: The launcher settings object (stored, not read here)
+        :param output_path: The file the synthesized MP3 is written to
+        :raises ValueError: If fj is None
+        """
+        if fj is None:
+            # everything below dereferences fj, so fail with a clear message
+            raise ValueError("WordFetcherTTS requires a FruitJam instance (fj)")
+
+        self.output_path = output_path
+        self.fj = fj
+        self.launcher_config = launcher_config
+
+        # AWS auth requires us to have accurate date/time
+        fj.sync_time()
+
+        # setup adafruit_requests session
+        # pylint: disable=protected-access
+        esp = fj.network._wifi.esp
+        pool = adafruit_connection_manager.get_radio_socketpool(esp)
+        ssl_context = adafruit_connection_manager.get_radio_ssl_context(esp)
+        self.requests = adafruit_requests.Session(pool, ssl_context)
+
+        # read AWS keys from settings.toml
+        self.AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
+        self.AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
+
+    def fetch_word(self, word: str, voice: str = "Joanna") -> bool:
+        """
+        Fetch an MP3 saying a word from AWS Polly.
+
+        :param word: The word to speak
+        :param voice: The AWS Polly voice ID to use
+        :return: Boolean, whether the request was successful.
+        """
+        if not self.AWS_ACCESS_KEY or not self.AWS_SECRET_KEY:
+            print("Missing AWS credentials.")
+            return False
+
+        # yellow while the request is in flight
+        if self.fj:
+            self.fj.neopixels.fill(0xFFFF00)
+
+        success = text_to_speech_polly_http(
+            self.requests,
+            text=word,
+            access_key=self.AWS_ACCESS_KEY,
+            secret_key=self.AWS_SECRET_KEY,
+            output_file=self.output_path,
+            voice_id=voice,
+            region="us-east-1",
+            output_format="mp3",
+        )
+
+        # green once the request has returned, successful or not
+        if self.fj:
+            self.fj.neopixels.fill(0x00FF00)
+        return success
diff --git a/Fruit_Jam/Fruit_Jam_Spell_Jam/tts_local.py b/Fruit_Jam/Fruit_Jam_Spell_Jam/tts_local.py
@@ -0,0 +1,117 @@
+# tts_local.py
+"""Local-network text-to-speech backend for Spell Jam (kani-tts server.py HTTP API)."""
+import json
+import adafruit_connection_manager
+import adafruit_requests
+
+
+class WordFetcherTTS():
+    """Fetch a spoken word from a TTS server on the local network and save it to a file."""
+
+    def __init__(self, fj=None, launcher_config=None, output_path="/saves/tts_output.wav"):
+        """
+        Sync the clock and create a requests session without an SSL context.
+
+        :param fj: The FruitJam instance; required, it supplies the network radio
+        :param launcher_config: The launcher settings object; its data dict is read
+            for the spell_jam / tts_server_endpoint entry
+        :param output_path: The file the returned audio is written to
+        :raises ValueError: If fj is None
+        """
+        if fj is None:
+            # everything below dereferences fj, so fail with a clear message
+            raise ValueError("WordFetcherTTS requires a FruitJam instance (fj)")
+
+        self.output_path = output_path
+        self.launcher_config = launcher_config
+        self.fj = fj
+
+        # same startup step as tts_aws.py; nothing in this file reads the clock
+        fj.sync_time()
+
+        # setup adafruit_requests session
+        # pylint: disable=protected-access
+        pool = adafruit_connection_manager.get_radio_socketpool(fj.network._wifi.esp)
+        self.requests = adafruit_requests.Session(pool)
+
+    def fetch_word(self, word: str, voice: str = "katie") -> bool:
+        """
+        Fetch audio saying a word from the TTS server and save it to output_path.
+
+        :param word: The word to speak
+        :param voice: The voice ID, sent as a prefix of the request text
+        :return: Boolean, whether the audio was fetched and saved.
+        """
+        # yellow while the request is in flight
+        if self.fj:
+            self.fj.neopixels.fill(0xFFFF00)
+
+        audio_data = self.text_to_speech_http(text=word, voice_id=voice)
+
+        success = False
+        if audio_data:
+            # Save to file
+            try:
+                with open(self.output_path, "wb") as f:
+                    f.write(audio_data)
+                print(f"Audio saved to: {self.output_path}")
+                success = True
+            except Exception as e:  # pylint: disable=broad-except
+                print(f"Failed to save file: {e}")
+        else:
+            print("Failed to synthesize speech")
+
+        # green once the attempt has finished, successful or not
+        if self.fj:
+            self.fj.neopixels.fill(0x00FF00)
+        return success
+
+    def text_to_speech_http(self, text, voice_id):
+        """
+        Convert text to speech using the kani-tts AI local server.py HTTP API.
+
+        :param text: Text to convert
+        :param voice_id: Voice ID, sent as a "voice_id: text" prefix
+        :return: The response body on HTTP 200; None if the endpoint is not
+            configured, the server answers with another status, or the request fails
+        """
+        # Prepare request
+        endpoint = ""
+        if self.launcher_config:
+            settings = self.launcher_config.data
+            print(settings)
+            if "spell_jam" in settings:
+                endpoint = settings["spell_jam"].get("tts_server_endpoint", "")
+        if not endpoint:
+            print("tts_server_endpoint not configured in launcher.conf.json.")
+            return None
+
+        # Create request body
+        request_body = {
+            "text": f'{voice_id}: {text}',
+            "temperature": 0.4,
+            "max_tokens": 400,
+            "top_p": 0.95,
+            "chunk_size": 25,
+            "lookback_frames": 15,
+        }
+        payload = json.dumps(request_body)
+        # tolerate an endpoint configured with a trailing slash
+        base = endpoint.rstrip("/")
+        url = f"{base}/tts"
+        headers = {"Content-Type": "application/json"}
+        print(f"Making request to: {url}, headers: {headers}, payload: {payload}")
+
+        try:
+            response = self.requests.post(url, headers=headers, data=payload)
+
+            if response.status_code == 200:
+                return response.content
+
+            print(f"Error: HTTP {response.status_code}")
+            print(f"Response: {response.text}")
+            return None
+
+        except Exception as e:  # pylint: disable=broad-except
+            print(f"Request failed: {e}")
+            return None