Merged

26 commits
9637a56
Remove local kokoro implementation
Kostis-S-Z Feb 3, 2025
3aee8ac
Remove local kokoro implementation
Kostis-S-Z Feb 3, 2025
30bcba9
Add WIP implementation
Kostis-S-Z Feb 3, 2025
8f30aa1
Replace oute for kokoro in demo app
Kostis-S-Z Feb 6, 2025
7f21f37
Remove oute support from cli
Kostis-S-Z Feb 6, 2025
dffb8b1
Use kokoro values as default in config
Kostis-S-Z Feb 6, 2025
53c23df
Update docs
Kostis-S-Z Feb 6, 2025
01e6296
Update pyproject.toml
Kostis-S-Z Feb 6, 2025
1dbb09c
Update notebook
Kostis-S-Z Feb 6, 2025
b92f7c1
Remove oute from model_loaders.py
Kostis-S-Z Feb 6, 2025
e6133c6
Remove oute from text_to_speech.py
Kostis-S-Z Feb 6, 2025
c30d668
Update tests
Kostis-S-Z Feb 6, 2025
398ecb7
Fix attribute call
Kostis-S-Z Feb 6, 2025
f98376e
Update tests
Kostis-S-Z Feb 6, 2025
6201eb9
Merge branch 'main' into kokoro-pip
Kostis-S-Z Feb 6, 2025
b926297
Fix mkdocs
Kostis-S-Z Feb 6, 2025
28abb4d
Merge remote-tracking branch 'origin/kokoro-pip' into kokoro-pip
Kostis-S-Z Feb 6, 2025
6872072
Add soundfile to dependencies
Kostis-S-Z Feb 10, 2025
694a5c3
Load kokoro model after setting speaker profiles
Kostis-S-Z Feb 10, 2025
d8dde83
Merge branch 'main' into kokoro-pip
Kostis-S-Z Feb 10, 2025
4b3ce8f
Add hardware requirements to README
Kostis-S-Z Feb 10, 2025
8ba2470
Update url in colab
Kostis-S-Z Feb 10, 2025
1cebed7
Update colab with new kokoro
Kostis-S-Z Feb 10, 2025
113cb20
Update min specs for dev container codespaces
Kostis-S-Z Feb 10, 2025
7ab64b2
Merge branch 'kokoro-pip' into 102-mention-system-requirements
Kostis-S-Z Feb 10, 2025
dd72f53
Merge branch 'main' into 102-mention-system-requirements
daavoo Feb 11, 2025
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -1,7 +1,7 @@
{
"hostRequirements": {
"cpus": 4,
"memory": "16gb",
"memory": "8gb",
"storage": "32gb"
},
"name": "Python Development Container",
5 changes: 5 additions & 0 deletions README.md
@@ -79,6 +79,11 @@ pip install -e .
python -m streamlit run demo/app.py
```

## System requirements
- OS: Windows, macOS, or Linux
- Python 3.10+ (3.12+ for Apple M-series chips)
- Minimum RAM: 8 GB
- Disk space: 20 GB minimum
Contributor:
Where are the 20GB coming from?

Contributor (author):
Well, I thought the venv is ~7 GB, bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf is also ~8 GB, Kokoro is, let's say, ~1 GB, and then some slack space for loading the document and such? 😅 As you can see, my method is very scientific... do you have any other suggestions?

Contributor:
No suggestions. It just sounds like more than I expected. Is the 7GB venv size with the CUDA version or the CPU version of torch? Anyway, not a blocker
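The estimate in the thread above can be written out explicitly. This is only a back-of-the-envelope sketch; the component sizes are the rough figures quoted in the discussion, not measured values:

```python
# Back-of-the-envelope disk estimate behind the "20 GB minimum" figure.
# All sizes are the rough numbers from the review thread, not measurements.
components_gb = {
    "virtualenv (torch + dependencies)": 7.0,
    "Qwen2.5-7B-Instruct-Q8_0.gguf": 8.0,
    "Kokoro-82M + voices": 1.0,
}
slack_gb = 4.0  # headroom for input documents, caches, generated audio

total_gb = sum(components_gb.values()) + slack_gb
print(f"Estimated disk usage: ~{total_gb:.0f} GB")
```

Note that the venv figure depends heavily on whether the CUDA or CPU build of torch gets installed, which is exactly the reviewer's open question.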


## License

41 changes: 14 additions & 27 deletions demo/app.py
@@ -1,7 +1,6 @@
"""Streamlit app for converting documents to podcasts."""

import io
import os
import re
from pathlib import Path

@@ -28,11 +27,8 @@ def load_text_to_text_model():


@st.cache_resource
def load_text_to_speech_model():
if os.environ.get("HF_SPACE") == "TRUE":
return load_tts_model("hexgrad/kLegacy/v0.19/kokoro-v0_19.pth")
else:
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
def load_text_to_speech_model(lang_code: str):
return load_tts_model("hexgrad/Kokoro-82M", **{"lang_code": lang_code})


def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
@@ -115,29 +111,11 @@ def gen_button_clicked():
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
)
st.divider()
tts_link = "- [hexgrad/Kokoro-82M](https://github.com/hexgrad/kokoro)"

SPEAKERS = DEFAULT_SPEAKERS

text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

if os.environ.get("HF_SPACE") == "TRUE":
tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
SPEAKERS = [
{
"id": 1,
"name": "Sarah",
"description": "The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.",
"voice_profile": "af_sarah",
},
{
"id": 2,
"name": "Michael",
"description": "The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.",
"voice_profile": "am_michael",
},
]
else:
tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
SPEAKERS = DEFAULT_SPEAKERS

st.markdown(
"For this demo, we are using the following models: \n"
@@ -180,6 +158,15 @@ def gen_button_clicked():
speaker.get(x, None) for x in ["name", "description", "voice_profile"]
)
)
if speakers[0]["voice_profile"][0] != speakers[1]["voice_profile"][0]:
raise ValueError(
"Both Kokoro speakers need to have the same language code. "
"More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md"
)
# Get which language is used for generation from the first character of the Kokoro voice profile
language_code = speakers[0]["voice_profile"][0]
speech_model = load_text_to_speech_model(lang_code=language_code)

system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)
with st.spinner("Generating Podcast..."):
text = ""
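The language-code check added to `demo/app.py` above can be exercised on its own. The sketch below mirrors that logic with minimal stand-in speaker dicts; the convention that the first character of a Kokoro voice profile encodes the language (e.g. `a` in `af_sarah` for American English) follows the model's VOICES.md:

```python
def get_language_code(speakers: list) -> str:
    """Mirror of the demo's validation: both Kokoro voices must share a language.

    The first character of a voice profile (e.g. 'a' in 'af_sarah') encodes
    the language, per https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
    """
    if speakers[0]["voice_profile"][0] != speakers[1]["voice_profile"][0]:
        raise ValueError(
            "Both Kokoro speakers need to have the same language code."
        )
    return speakers[0]["voice_profile"][0]


speakers = [
    {"name": "Sarah", "voice_profile": "af_sarah"},
    {"name": "Michael", "voice_profile": "am_michael"},
]
print(get_language_code(speakers))  # -> a
```

With a mismatched pair (e.g. `af_sarah` together with a `b`-prefixed British voice), the check raises before any model is loaded, which is why the demo now loads the TTS model only after the speakers are configured.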
138 changes: 80 additions & 58 deletions demo/notebook.ipynb
@@ -80,7 +80,7 @@
"outputs": [],
"source": [
"%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl\n",
"%pip install --quiet git+https://github.com/mozilla-ai/document-to-podcast.git@text-to-speech-model\n",
"%pip install --quiet git+https://github.com/mozilla-ai/document-to-podcast.git\n",
"%pip install --quiet phonemizer"
]
},
@@ -163,68 +163,13 @@
"print(clean_text[:200])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Downloading and loading models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For this demo, we are using the following models:\n",
" - [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n",
" - [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/) for more information on how to use different models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from document_to_podcast.inference.model_loaders import (\n",
" load_llama_cpp_model,\n",
" load_tts_model,\n",
")\n",
"\n",
"text_model = load_llama_cpp_model(\n",
" \"bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf\"\n",
")\n",
"speech_model = load_tts_model(\"hexgrad/kLegacy/v0.19/kokoro-v0_19.pth\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_characters = text_model.n_ctx() * 4\n",
"if len(clean_text) > max_characters:\n",
" print(\n",
" f\"Input text is too big ({len(clean_text)}).\"\n",
" f\" Using only a subset of it ({max_characters}).\"\n",
" )\n",
" clean_text = clean_text[:max_characters]"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -310,6 +255,69 @@
"print(system_prompt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Downloading and loading models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For this demo, we are using the following models:\n",
" - [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n",
" - [hexgrad/Kokoro-82M](https://github.com/hexgrad/kokoro)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/) for more information on how to use different models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from document_to_podcast.inference.model_loaders import (\n",
" load_llama_cpp_model,\n",
" load_tts_model,\n",
")\n",
"\n",
"if speakers[0][\"voice_profile\"][0] != speakers[1][\"voice_profile\"][0]:\n",
" raise ValueError(\n",
" \"Both Kokoro speakers need to have the same language code. \"\n",
" \"More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md\"\n",
" )\n",
"# Get which language is used for generation from the first character of the Kokoro voice profile\n",
"language_code = speakers[0][\"voice_profile\"][0]\n",
"\n",
"text_model = load_llama_cpp_model(\n",
" \"bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf\"\n",
")\n",
"speech_model = load_tts_model(\"hexgrad/Kokoro-82M\", **{\"lang_code\": language_code})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_characters = text_model.n_ctx() * 4\n",
"if len(clean_text) > max_characters:\n",
" print(\n",
" f\"Input text is too big ({len(clean_text)}).\"\n",
" f\" Using only a subset of it ({max_characters}).\"\n",
" )\n",
" clean_text = clean_text[:max_characters]"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -405,10 +413,24 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 4
}
2 changes: 2 additions & 0 deletions docs/api.md
@@ -1,5 +1,7 @@
# API Reference

::: document_to_podcast.preprocessing.data_loaders

::: document_to_podcast.preprocessing.data_cleaners

::: document_to_podcast.inference.model_loaders
4 changes: 2 additions & 2 deletions docs/customization.md
@@ -22,9 +22,9 @@ For example: `Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf`.
The model used to generate the audio from the podcast script.

You can use any of the models listed in [`TTS_LOADERS`](api.md/#document_to_podcast.inference.model_loaders.TTS_LOADERS) out of the box.
We currently support [OuteTTS](https://github.com/edwko/OuteTTS).
We currently support [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M).

If you want to use a different model, you can integrate it by implementing the `_load` and `_text_to_speech` functions and registering them in [`TTS_LOADERS`](api.md/#document_to_podcast.inference.model_loaders.TTS_LOADERS) and [`TTS_INFERENCE`](api.md/#document_to_podcast.inference.model_loaders.TTS_INFERENCE).
If you want to use a different model, you can integrate it by implementing the `_load` and `_text_to_speech` functions and registering them in [`TTS_LOADERS`](api.md/#document_to_podcast.inference.model_loaders.TTS_LOADERS) and [`TTS_INFERENCE`](api.md/#document_to_podcast.inference.text_to_speech.TTS_INFERENCE).
You can check [this repo](https://github.com/Kostis-S-Z/document-to-podcast/) where different text-to-speech models are integrated.

## 🖋️ **Other Customizable Parameters**
14 changes: 5 additions & 9 deletions docs/step-by-step-guide.md
@@ -162,34 +162,30 @@ In this final step, the generated podcast transcript is brought to life as an audio file.

- The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-text` and `text-to-speech` models.

- The function `load_outetts_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model, either on CPU or GPU, based on the `device` parameter. The `language` parameter also allows swapping between the languages the Oute package supports (as of Dec 2024: `en, zh, ja, ko`)


**2 - Text-to-Speech Audio Generation**

- The [`text_to_speech.py`](api.md/#document_to_podcast.inference.text_to_speech) script converts text into audio using a specified TTS model.

- A **speaker profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. This is specific to each TTS package. Oute models require one of the IDs specified [here](https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers).
- A **voice profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. This is specific to each TTS package. Kokoro models require one of the IDs specified [here](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md).

- The function `text_to_speech` takes the input text (e.g. podcast script) and speaker profile, generating a waveform (audio data in a numpy array) that represents the spoken version of the text.

### 🔍 **API Example**

```py
import soundfile as sf
from document_to_podcast.inference.model_loaders import load_outetts_model
from document_to_podcast.inference.model_loaders import load_tts_model
from document_to_podcast.inference.text_to_speech import text_to_speech

# Load the TTS model
model = load_outetts_model(
"OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf"
)
model = load_tts_model("hexgrad/Kokoro-82M", **{"lang_code": 'a'})

# Generate the waveform
waveform = text_to_speech(
input_text="Welcome to our amazing podcast",
model=model,
voice_profile="male_1"
voice_profile="af_sarah"
)

# Save the audio file
@@ -227,7 +223,7 @@ This demo uses [Streamlit](https://streamlit.io/), an open-source Python framework.

- The script uses `load_llama_cpp_model` from `model_loader.py` to load the LLM for generating the podcast script.

- Similarly, `load_outetts_model` is used to prepare the TTS model and tokenizer for audio generation.
- Similarly, `load_tts_model` is used to prepare the TTS model and tokenizer for audio generation.

- These models are cached using `@st.cache_resource` to ensure fast and efficient reuse during app interactions.

12 changes: 6 additions & 6 deletions example_data/config.yaml
@@ -1,7 +1,7 @@
input_file: "example_data/introducing-mozilla-ai-investing-in-trustworthy-ai.html"
output_folder: "example_data/"
text_to_text_model: "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
text_to_speech_model: "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf"
text_to_text_model: "bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
text_to_speech_model: "hexgrad/Kokoro-82M"
text_to_text_prompt: |
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
The script features the following speakers:
@@ -20,10 +20,10 @@ text_to_text_prompt: |
}
speakers:
- id: 1
name: Laura
name: Sarah
description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
voice_profile: female_1
voice_profile: af_sarah
- id: 2
name: Jon
name: Michael
description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
voice_profile: male_1
voice_profile: am_michael
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -12,12 +12,13 @@ dependencies = [
"beautifulsoup4",
"fire",
"huggingface-hub",
"kokoro>=0.7.9",
"llama-cpp-python",
"loguru",
"outetts<0.3",
"pydantic",
"PyPDF2[crypto]",
"python-docx",
"soundfile",
"streamlit",
]
