Skip to content

Commit f534c46

Browse files
Kostis-S-Z and daavoo authored
Replace Oute model with stable kokoro package (#100)
* Remove local kokoro implementation
* Remove local kokoro implementation
* Add WIP implementation
* Replace oute for kokoro in demo app
* Remove oute support from cli
* Use kokoro values as default in config
* Update docs
* Update pyproject.toml
* Update notebook
* Remove oute from model_loaders.py
* Remove oute from text_to_speech.py
* Update tests
* Fix attribute call
* Update tests
* Fix mkdocs
* Add soundfile to dependencies

  Co-authored-by: David de la Iglesia Castro <daviddelaiglesiacastro@gmail.com>
* Load kokoro model after setting speaker profiles
* Update url in colab
* Update colab with new kokoro

---------

Co-authored-by: David de la Iglesia Castro <daviddelaiglesiacastro@gmail.com>
1 parent 7bb48ae commit f534c46

File tree

18 files changed

+173
-1589
lines changed

18 files changed

+173
-1589
lines changed

demo/app.py

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Streamlit app for converting documents to podcasts."""
22

33
import io
4-
import os
54
import re
65
from pathlib import Path
76

@@ -28,11 +27,8 @@ def load_text_to_text_model():
2827

2928

3029
@st.cache_resource
31-
def load_text_to_speech_model():
32-
if os.environ.get("HF_SPACE") == "TRUE":
33-
return load_tts_model("hexgrad/kLegacy/v0.19/kokoro-v0_19.pth")
34-
else:
35-
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
30+
def load_text_to_speech_model(lang_code: str):
31+
return load_tts_model("hexgrad/Kokoro-82M", **{"lang_code": lang_code})
3632

3733

3834
def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
@@ -115,29 +111,11 @@ def gen_button_clicked():
115111
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
116112
)
117113
st.divider()
114+
tts_link = "- [hexgrad/Kokoro-82M](https://github.com/hexgrad/kokoro)"
115+
116+
SPEAKERS = DEFAULT_SPEAKERS
118117

119118
text_model = load_text_to_text_model()
120-
speech_model = load_text_to_speech_model()
121-
122-
if os.environ.get("HF_SPACE") == "TRUE":
123-
tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
124-
SPEAKERS = [
125-
{
126-
"id": 1,
127-
"name": "Sarah",
128-
"description": "The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.",
129-
"voice_profile": "af_sarah",
130-
},
131-
{
132-
"id": 2,
133-
"name": "Michael",
134-
"description": "The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.",
135-
"voice_profile": "am_michael",
136-
},
137-
]
138-
else:
139-
tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
140-
SPEAKERS = DEFAULT_SPEAKERS
141119

142120
st.markdown(
143121
"For this demo, we are using the following models: \n"
@@ -180,6 +158,15 @@ def gen_button_clicked():
180158
speaker.get(x, None) for x in ["name", "description", "voice_profile"]
181159
)
182160
)
161+
if speakers[0]["voice_profile"][0] != speakers[1]["voice_profile"][0]:
162+
raise ValueError(
163+
"Both Kokoro speakers need to have the same language code. "
164+
"More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md"
165+
)
166+
# Get which language is used for generation from the first character of the Kokoro voice profile
167+
language_code = speakers[0]["voice_profile"][0]
168+
speech_model = load_text_to_speech_model(lang_code=language_code)
169+
183170
system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)
184171
with st.spinner("Generating Podcast..."):
185172
text = ""

demo/notebook.ipynb

Lines changed: 80 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
"outputs": [],
8181
"source": [
8282
"%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl\n",
83-
"%pip install --quiet git+https://github.com/mozilla-ai/document-to-podcast.git@text-to-speech-model\n",
83+
"%pip install --quiet git+https://github.com/mozilla-ai/document-to-podcast.git\n",
8484
"%pip install --quiet phonemizer"
8585
]
8686
},
@@ -163,68 +163,13 @@
163163
"print(clean_text[:200])"
164164
]
165165
},
166-
{
167-
"cell_type": "markdown",
168-
"metadata": {},
169-
"source": [
170-
"## Downloading and loading models"
171-
]
172-
},
173166
{
174167
"cell_type": "markdown",
175168
"metadata": {},
176169
"source": [
177170
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
178171
]
179172
},
180-
{
181-
"cell_type": "markdown",
182-
"metadata": {},
183-
"source": [
184-
"For this demo, we are using the following models:\n",
185-
" - [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n",
186-
" - [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
187-
]
188-
},
189-
{
190-
"cell_type": "markdown",
191-
"metadata": {},
192-
"source": [
193-
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/) for more information on how to use different models."
194-
]
195-
},
196-
{
197-
"cell_type": "code",
198-
"execution_count": null,
199-
"metadata": {},
200-
"outputs": [],
201-
"source": [
202-
"from document_to_podcast.inference.model_loaders import (\n",
203-
" load_llama_cpp_model,\n",
204-
" load_tts_model,\n",
205-
")\n",
206-
"\n",
207-
"text_model = load_llama_cpp_model(\n",
208-
" \"bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf\"\n",
209-
")\n",
210-
"speech_model = load_tts_model(\"hexgrad/kLegacy/v0.19/kokoro-v0_19.pth\")"
211-
]
212-
},
213-
{
214-
"cell_type": "code",
215-
"execution_count": null,
216-
"metadata": {},
217-
"outputs": [],
218-
"source": [
219-
"max_characters = text_model.n_ctx() * 4\n",
220-
"if len(clean_text) > max_characters:\n",
221-
" print(\n",
222-
" f\"Input text is too big ({len(clean_text)}).\"\n",
223-
" f\" Using only a subset of it ({max_characters}).\"\n",
224-
" )\n",
225-
" clean_text = clean_text[:max_characters]"
226-
]
227-
},
228173
{
229174
"cell_type": "markdown",
230175
"metadata": {},
@@ -310,6 +255,69 @@
310255
"print(system_prompt)"
311256
]
312257
},
258+
{
259+
"cell_type": "markdown",
260+
"metadata": {},
261+
"source": [
262+
"## Downloading and loading models"
263+
]
264+
},
265+
{
266+
"cell_type": "markdown",
267+
"metadata": {},
268+
"source": [
269+
"For this demo, we are using the following models:\n",
270+
" - [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n",
271+
" - [hexgrad/Kokoro-82M](https://github.com/hexgrad/kokoro)"
272+
]
273+
},
274+
{
275+
"cell_type": "markdown",
276+
"metadata": {},
277+
"source": [
278+
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/) for more information on how to use different models."
279+
]
280+
},
281+
{
282+
"cell_type": "code",
283+
"execution_count": null,
284+
"metadata": {},
285+
"outputs": [],
286+
"source": [
287+
"from document_to_podcast.inference.model_loaders import (\n",
288+
" load_llama_cpp_model,\n",
289+
" load_tts_model,\n",
290+
")\n",
291+
"\n",
292+
"if speakers[0][\"voice_profile\"][0] != speakers[1][\"voice_profile\"][0]:\n",
293+
" raise ValueError(\n",
294+
" \"Both Kokoro speakers need to have the same language code. \"\n",
295+
" \"More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md\"\n",
296+
" )\n",
297+
"# Get which language is used for generation from the first character of the Kokoro voice profile\n",
298+
"language_code = speakers[0][\"voice_profile\"][0]\n",
299+
"\n",
300+
"text_model = load_llama_cpp_model(\n",
301+
" \"bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf\"\n",
302+
")\n",
303+
"speech_model = load_tts_model(\"hexgrad/Kokoro-82M\", **{\"lang_code\": language_code})"
304+
]
305+
},
306+
{
307+
"cell_type": "code",
308+
"execution_count": null,
309+
"metadata": {},
310+
"outputs": [],
311+
"source": [
312+
"max_characters = text_model.n_ctx() * 4\n",
313+
"if len(clean_text) > max_characters:\n",
314+
" print(\n",
315+
" f\"Input text is too big ({len(clean_text)}).\"\n",
316+
" f\" Using only a subset of it ({max_characters}).\"\n",
317+
" )\n",
318+
" clean_text = clean_text[:max_characters]"
319+
]
320+
},
313321
{
314322
"cell_type": "markdown",
315323
"metadata": {},
@@ -405,10 +413,24 @@
405413
}
406414
],
407415
"metadata": {
416+
"kernelspec": {
417+
"display_name": "Python 3 (ipykernel)",
418+
"language": "python",
419+
"name": "python3"
420+
},
408421
"language_info": {
409-
"name": "python"
422+
"codemirror_mode": {
423+
"name": "ipython",
424+
"version": 3
425+
},
426+
"file_extension": ".py",
427+
"mimetype": "text/x-python",
428+
"name": "python",
429+
"nbconvert_exporter": "python",
430+
"pygments_lexer": "ipython3",
431+
"version": "3.12.3"
410432
}
411433
},
412434
"nbformat": 4,
413-
"nbformat_minor": 0
435+
"nbformat_minor": 4
414436
}

docs/api.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# API Reference
22

3+
::: document_to_podcast.preprocessing.data_loaders
4+
35
::: document_to_podcast.preprocessing.data_cleaners
46

57
::: document_to_podcast.inference.model_loaders

docs/customization.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ For example: `Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf`.
2222
The model used to generate the audio from the podcast script.
2323

2424
You can use any of the models listed in [`TTS_LOADERS`](api.md/#document_to_podcast.inference.model_loaders.TTS_LOADERS) out of the box.
25-
We currently support [OuteTTS](https://github.com/edwko/OuteTTS).
25+
We currently support [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M).
2626

27-
If you want to use a different model, you can integrate it by implementing the `_load` and `_text_to_speech` functions and registering them in [`TTS_LOADERS`](api.md/#document_to_podcast.inference.model_loaders.TTS_LOADERS) and [`TTS_INFERENCE`](api.md/#document_to_podcast.inference.model_loaders.TTS_INFERENCE).
27+
If you want to use a different model, you can integrate it by implementing the `_load` and `_text_to_speech` functions and registering them in [`TTS_LOADERS`](api.md/#document_to_podcast.inference.model_loaders.TTS_LOADERS) and [`TTS_INFERENCE`](api.md/#document_to_podcast.inference.text_to_speech.TTS_INFERENCE).
2828
You can check [this repo](https://github.com/Kostis-S-Z/document-to-podcast/) where different text-to-speech models are integrated.
2929

3030
## 🖋️ **Other Customizable Parameters**

docs/step-by-step-guide.md

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,34 +162,30 @@ In this final step, the generated podcast transcript is brought to life as an au
162162

163163
- The [`model_loaders.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-text` and `text-to-speech` models.
164164

165-
- The function `load_outetts_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model, either on CPU or GPU, based on the `device` parameter. The parameter `language` also enables to swap between the languages the Oute package supports (as of Dec 2024: `en, zh, ja, ko`)
166-
167165

168166
**2 - Text-to-Speech Audio Generation**
169167

170168
- The [`text_to_speech.py`](api.md/#document_to_podcast.inference.text_to_speech) script converts text into audio using a specified TTS model.
171169

172-
- A **speaker profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. This is specific to each TTS package. Oute models require one of the IDs specified [here](https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers).
170+
- A **voice profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. This is specific to each TTS package. Kokoro models require one of the IDs specified [here](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md).
173171

174172
- The function `text_to_speech` takes the input text (e.g. podcast script) and a voice profile, generating a waveform (audio data in a numpy array) that represents the spoken version of the text.
175173

176174
### 🔍 **API Example**
177175

178176
```py
179177
import soundfile as sf
180-
from document_to_podcast.inference.model_loaders import load_outetts_model
178+
from document_to_podcast.inference.model_loaders import load_tts_model
181179
from document_to_podcast.inference.text_to_speech import text_to_speech
182180

183181
# Load the TTS model
184-
model = load_outetts_model(
185-
"OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf"
186-
)
182+
model = load_tts_model("hexgrad/Kokoro-82M", **{"lang_code": 'a'})
187183

188184
# Generate the waveform
189185
waveform = text_to_speech(
190186
input_text="Welcome to our amazing podcast",
191187
model=model,
192-
voice_profile="male_1"
188+
voice_profile="af_sarah"
193189
)
194190

195191
# Save the audio file
@@ -227,7 +223,7 @@ This demo uses [Streamlit](https://streamlit.io/), an open-source Python framewo
227223

228224
- The script uses `load_llama_cpp_model` from `model_loaders.py` to load the LLM for generating the podcast script.
229225

230-
- Similarly, `load_outetts_model` is used to prepare the TTS model and tokenizer for audio generation.
226+
- Similarly, `load_tts_model` is used to prepare the TTS model and tokenizer for audio generation.
231227

232228
- These models are cached using `@st.cache_resource` to ensure fast and efficient reuse during app interactions.
233229

example_data/config.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
input_file: "example_data/introducing-mozilla-ai-investing-in-trustworthy-ai.html"
22
output_folder: "example_data/"
3-
text_to_text_model: "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
4-
text_to_speech_model: "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf"
3+
text_to_text_model: "bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
4+
text_to_speech_model: "hexgrad/Kokoro-82M"
55
text_to_text_prompt: |
66
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
77
The script features the following speakers:
@@ -20,10 +20,10 @@ text_to_text_prompt: |
2020
}
2121
speakers:
2222
- id: 1
23-
name: Laura
23+
name: Sarah
2424
description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
25-
voice_profile: female_1
25+
voice_profile: af_sarah
2626
- id: 2
27-
name: Jon
27+
name: Michael
2828
description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
29-
voice_profile: male_1
29+
voice_profile: am_michael

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@ dependencies = [
1212
"beautifulsoup4",
1313
"fire",
1414
"huggingface-hub",
15+
"kokoro>=0.7.9",
1516
"llama-cpp-python",
1617
"loguru",
17-
"outetts<0.3",
1818
"pydantic",
1919
"PyPDF2[crypto]",
2020
"python-docx",
21+
"soundfile",
2122
"streamlit",
2223
]
2324

0 commit comments

Comments
 (0)