hotfix: use legacy kokoro repo (hexgrad/kLegacy, v0.19) for demo, notebook and TTS loaders; fix SPEAKERS typo in demo

Kostis-S-Z · Kostis-S-Z · commit f79d2db566b2 · 2025-01-28T17:41:45.000-03:00
diff --git a/demo/app.py b/demo/app.py
@@ -30,7 +30,7 @@ def load_text_to_text_model():
 @st.cache_resource
 def load_text_to_speech_model():
     if os.environ.get("HF_SPACE") == "TRUE":
-        return load_tts_model("hexgrad/Kokoro-82M/kokoro-v0_19.pth")
+        return load_tts_model("hexgrad/kLegacy/v0.19/kokoro-v0_19.pth")
     else:
         return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
 
@@ -137,7 +137,7 @@ def gen_button_clicked():
         ]
     else:
         tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
-        SPEARES = DEFAULT_SPEAKERS
+        SPEAKERS = DEFAULT_SPEAKERS
 
     st.markdown(
         "For this demo, we are using the following models: \n"
diff --git a/demo/notebook.ipynb b/demo/notebook.ipynb
@@ -207,7 +207,7 @@
     "text_model = load_llama_cpp_model(\n",
     "    \"bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf\"\n",
     ")\n",
-    "speech_model = load_tts_model(\"hexgrad/Kokoro-82M/kokoro-v0_19.pth\")"
+    "speech_model = load_tts_model(\"hexgrad/kLegacy/v0.19/kokoro-v0_19.pth\")"
    ]
   },
   {
diff --git a/src/document_to_podcast/inference/model_loaders.py b/src/document_to_podcast/inference/model_loaders.py
@@ -81,28 +81,25 @@ def _load_oute_tts(model_id: str, **kwargs) -> TTSModel:
     )
 
 
-def _load_kokoro_tts(model_id: str, **kwargs) -> TTSModel:
+def _load_kokoro_legacy_tts(model_id: str, **kwargs) -> TTSModel:
     from document_to_podcast.inference.kokoro.models import build_model
 
-    org, repo, filename = model_id.split("/")
-    downloaded_model = hf_hub_download(f"{org}/{repo}", filename)
+    org, repo, kokoro_version, filename = model_id.split("/")
+    downloaded_model = hf_hub_download(f"{org}/{repo}", f"{kokoro_version}/{filename}")
     model = build_model(downloaded_model)
     return TTSModel(
         model=model,
         model_id=model_id,
         sample_rate=24000,
-        custom_args={
-            "org": org,
-            "repo": repo,
-        },
+        custom_args={"org": org, "repo": repo, "kokoro_version": kokoro_version},
     )
 
 
 TTS_LOADERS = {
     # To add support for your model, add it here in the format {model_id} : _load_function
     "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf": _load_oute_tts,
     "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf": _load_oute_tts,
-    "hexgrad/Kokoro-82M/kokoro-v0_19.pth": _load_kokoro_tts,
+    "hexgrad/kLegacy/v0.19/kokoro-v0_19.pth": _load_kokoro_legacy_tts,
 }
 
 
diff --git a/src/document_to_podcast/inference/text_to_speech.py b/src/document_to_podcast/inference/text_to_speech.py
@@ -40,12 +40,14 @@ def _text_to_speech_oute(
     return output_as_np
 
 
-def _text_to_speech_kokoro(input_text, model, voice_profile, org, repo):
+def _text_to_speech_kokoro(input_text, model, voice_profile, org, repo, kokoro_version):
     import torch
     from huggingface_hub import hf_hub_download
     from document_to_podcast.inference.kokoro.infer import generate
 
-    downloaded_voice = hf_hub_download(f"{org}/{repo}", f"voices/{voice_profile}.pt")
+    downloaded_voice = hf_hub_download(
+        f"{org}/{repo}", f"{kokoro_version}/voices/{voice_profile}.pt"
+    )
     voicepack = torch.load(downloaded_voice).to(
         torch.device("cuda" if torch.cuda.is_available() else "cpu")
     )
@@ -58,7 +60,7 @@ def _text_to_speech_kokoro(input_text, model, voice_profile, org, repo):
     # To add support for your model, add it here in the format {model_id} : _inference_function
     "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf": _text_to_speech_oute,
     "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf": _text_to_speech_oute,
-    "hexgrad/Kokoro-82M/kokoro-v0_19.pth": _text_to_speech_kokoro,
+    "hexgrad/kLegacy/v0.19/kokoro-v0_19.pth": _text_to_speech_kokoro,
 }
 
 

Original file line number	Diff line number	Diff line change
`@@ -207,7 +207,7 @@`
`207`	`207`	`"text_model = load_llama_cpp_model(\n",`
`208`	`208`	`" \"bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf\"\n",`
`209`	`209`	`")\n",`
`210`		`- "speech_model = load_tts_model(\"hexgrad/Kokoro-82M/kokoro-v0_19.pth\")"`
	`210`	`+ "speech_model = load_tts_model(\"hexgrad/kLegacy/v0.19/kokoro-v0_19.pth\")"`
`211`	`211`	`]`
`212`	`212`	`},`
`213`	`213`	`{`