Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions demo/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
git \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

RUN apt-get install espeak-ng -y

RUN useradd -m -u 1000 user

USER user
Expand All @@ -18,6 +20,7 @@ ENV HOME=/home/user \
WORKDIR $HOME/app

RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
RUN pip3 install phonemizer
RUN pip3 install document-to-podcast

COPY --chown=user . $HOME/app
Expand Down
17 changes: 13 additions & 4 deletions demo/app.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Streamlit app for converting documents to podcasts."""

import io
import os
import re
from pathlib import Path
import io

import numpy as np
import soundfile as sf
Expand All @@ -28,7 +29,10 @@ def load_text_to_text_model():

@st.cache_resource
def load_text_to_speech_model():
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
if os.environ.get("HF_SPACE") == "TRUE":
return load_tts_model("hexgrad/Kokoro-82M/kokoro-v0_19.pth")
else:
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")


def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
Expand Down Expand Up @@ -115,10 +119,15 @@ def gen_button_clicked():
text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

if os.environ.get("HF_SPACE") == "TRUE":
tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
else:
tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"

st.markdown(
"For this demo, we are using the following models: \n"
"- [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n"
"- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
f"{tts_link}\n"
)
st.markdown(
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
Expand Down Expand Up @@ -187,7 +196,7 @@ def gen_button_clicked():

if st.session_state[gen_button]:
audio_np = stack_audio_segments(
st.session_state.audio, speech_model.sample_rate
st.session_state.audio, speech_model.sample_rate, silence_pad=0.0
)
audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
if st.download_button(
Expand Down
26 changes: 18 additions & 8 deletions demo/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,18 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl\n",
"%pip install --quiet document-to-podcast"
"%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl\n",
"%pip install --quiet git+https://github.com/mozilla-ai/document-to-podcast.git@text-to-speech-model\n",
"%pip install --quiet phonemizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!apt-get -qq -y install espeak-ng"
]
},
{
Expand Down Expand Up @@ -173,7 +183,7 @@
"source": [
"For this demo, we are using the following models:\n",
" - [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n",
" - [OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
" - [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
]
},
{
Expand All @@ -197,7 +207,7 @@
"text_model = load_llama_cpp_model(\n",
" \"bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf\"\n",
")\n",
"speech_model = load_tts_model(\"OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf\")"
"speech_model = load_tts_model(\"hexgrad/Kokoro-82M/kokoro-v0_19.pth\")"
]
},
{
Expand Down Expand Up @@ -247,15 +257,15 @@
"speakers = [\n",
" {\n",
" \"id\": 1,\n",
" \"name\": \"Laura\",\n",
" \"name\": \"Sarah\",\n",
" \"description\": \"The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.\",\n",
" \"voice_profile\": \"female_1\",\n",
" \"voice_profile\": \"af_sarah\",\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"name\": \"Jon\",\n",
" \"name\": \"Michael\",\n",
" \"description\": \"The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.\",\n",
" \"voice_profile\": \"male_1\",\n",
" \"voice_profile\": \"am_michael\",\n",
" },\n",
"]\n",
"\n",
Expand Down
1 change: 0 additions & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ These docs are your companion to mastering the **Document-to-Podcast Blueprint**
### Built with
- Python 3.10+
- [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e. script generation)
- [OuteAI](https://github.com/edwko/OuteTTS) (text-to-speech, i.e audio generation)
- [Streamlit](https://streamlit.io/) (UI demo)


Expand Down
6 changes: 3 additions & 3 deletions src/document_to_podcast/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def document_to_podcast(
- {output_folder}/podcast.txt
- {output_folder}/podcast.wav

text_to_text_model (str, optional): The path to the text-to-text model.
text_to_text_model (str, optional): The text-to-text model_id.

Needs to be formatted as `owner/repo/file`.

Expand All @@ -63,8 +63,8 @@ def document_to_podcast(
text_to_text_prompt (str, optional): The prompt for the text-to-text model.
Defaults to DEFAULT_PROMPT.

text_to_speech_model (str, optional): The path to the text-to-speech model.
Defaults to `OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf`.
text_to_speech_model (str, optional): The text-to-speech model_id.
Defaults to `OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf`.

speakers (list[Speaker] | None, optional): The speakers for the podcast.
Defaults to DEFAULT_SPEAKERS.
Expand Down
Empty file.
184 changes: 184 additions & 0 deletions src/document_to_podcast/inference/kokoro/infer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import phonemizer
import re
import torch


def split_num(num):
    """Spell out a matched time (``h:mm``) or four-digit year for TTS.

    Intended as a ``re.sub`` replacement callback; receives a Match object.
    Decimal matches are returned untouched (handled later by ``point_num``).
    """
    text = num.group()
    if "." in text:
        # Leave decimals for point_num, which runs in a later substitution.
        return text
    if ":" in text:
        hour, minute = (int(part) for part in text.split(":"))
        if minute == 0:
            return f"{hour} o'clock"
        if minute < 10:
            return f"{hour} oh {minute}"
        return f"{hour} {minute}"
    # Four-digit year (optionally plural, e.g. "1990s").
    year = int(text[:4])
    if year < 1100 or year % 1000 < 10:
        # Years like 1066 or 2005 read naturally as plain numbers.
        return text
    century, remainder = text[:2], int(text[2:4])
    plural = "s" if text.endswith("s") else ""
    if 100 <= year % 1000 <= 999:
        if remainder == 0:
            return f"{century} hundred{plural}"
        if remainder < 10:
            return f"{century} oh {remainder}{plural}"
    return f"{century} {remainder}{plural}"


def flip_money(m):
    """Spell out a matched currency amount ("$2.50" -> "2 dollars and 50 cents").

    Intended as a ``re.sub`` replacement callback; receives a Match object
    whose first character is the currency symbol ($ or £).
    """
    text = m.group()
    unit = "dollar" if text[0] == "$" else "pound"
    if text[-1].isalpha():
        # Amount ends in a magnitude word, e.g. "$3 million" -> "3 million dollars".
        return f"{text[1:]} {unit}s"
    if "." not in text:
        plural = "" if text[1:] == "1" else "s"
        return f"{text[1:]} {unit}{plural}"
    whole, frac = text[1:].split(".")
    plural = "" if whole == "1" else "s"
    # Pad a single fractional digit: "$2.5" means 50 cents, not 5.
    cents = int(frac.ljust(2, "0"))
    if text[0] == "$":
        coins = "cent" if cents == 1 else "cents"
    else:
        coins = "penny" if cents == 1 else "pence"
    return f"{whole} {unit}{plural} and {cents} {coins}"


def point_num(num):
    """Read a matched decimal aloud, digit by digit: "3.14" -> "3 point 1 4".

    Intended as a ``re.sub`` replacement callback; receives a Match object.
    """
    whole, frac = num.group().split(".")
    return f"{whole} point {' '.join(frac)}"


def normalize_text(text):
    """Normalize raw text for phonemization.

    Applies, in order: quote/bracket canonicalization, CJK punctuation
    mapping, whitespace cleanup, common-abbreviation expansion, and
    number/currency/decimal verbalization via the ``split_num``,
    ``flip_money`` and ``point_num`` callbacks. The substitution order is
    significant — e.g. currency must be handled before bare decimals.
    """
    # Curly single quotes -> ASCII apostrophe.
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
    # Guillemets -> curly double quotes, then all curly doubles -> ASCII ".
    text = text.replace("«", chr(8220)).replace("»", chr(8221))
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
    # Parentheses are re-encoded as guillemets (now free after the step above).
    text = text.replace("(", "«").replace(")", "»")
    # CJK punctuation -> ASCII equivalent plus a space.
    for a, b in zip("、。!,:;?", ",.!,:;?"):
        text = text.replace(a, b + " ")
    # Collapse exotic whitespace (keep spaces and newlines), then runs of spaces.
    text = re.sub(r"[^\S \n]", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"(?<=\n) +(?=\n)", "", text)
    # Expand titles/abbreviations so espeak reads them as words.
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
    # "yeah"/"yea" -> "ye'a" pronunciation hint (case preserved on the y).
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
    # Verbalize decimals, 4-digit years, and h:mm times.
    text = re.sub(
        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
    )
    # Strip thousands separators: 1,000 -> 1000.
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    # Verbalize currency amounts (must run before the bare-decimal rule below).
    text = re.sub(
        r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
        flip_money,
        text,
    )
    # Remaining decimals are read digit by digit.
    text = re.sub(r"\d*\.\d+", point_num, text)
    # Numeric ranges: "5-10" -> "5 to 10".
    text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
    text = re.sub(r"(?<=\d)S", " S", text)
    # Possessive/plural 's after consonant initialisms -> 'S, e.g. "TV's".
    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
    text = re.sub(r"(?<=X')S\b", "s", text)
    # Dotted initialisms followed by a lowercase word: "U.S. a" -> "U-S- a".
    text = re.sub(
        r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
    )
    text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
    return text.strip()


def get_vocab():
    """Build the symbol -> token-id vocabulary used by the Kokoro model.

    The order is significant: pad symbol first (id 0), then punctuation,
    ASCII letters, and IPA phoneme characters. Returns a dict mapping each
    single-character symbol to its integer token id.
    """
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»“” '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
    # Dict comprehension replaces the original index loop; for duplicate
    # symbols the later index wins, matching the original behavior.
    return {symbol: index for index, symbol in enumerate(symbols)}


VOCAB = get_vocab()


def tokenize(ps):
    """Map a phoneme string to token ids, silently dropping unknown symbols."""
    token_ids = []
    for symbol in ps:
        token_id = VOCAB.get(symbol)
        if token_id is not None:
            token_ids.append(token_id)
    return token_ids


# espeak-ng backends keyed by language code: "a" = American English,
# "b" = British English. Instantiated once at import time; requires the
# espeak-ng system package to be installed.
phonemizers = dict(
    a=phonemizer.backend.EspeakBackend(
        language="en-us", preserve_punctuation=True, with_stress=True
    ),
    b=phonemizer.backend.EspeakBackend(
        language="en-gb", preserve_punctuation=True, with_stress=True
    ),
)


def phonemize(text, lang, norm=True):
    """Convert text to a phoneme string for language "a" (en-us) or "b" (en-gb).

    Optionally normalizes the text first, runs the espeak backend, applies
    Kokoro-specific pronunciation fixups, and keeps only symbols present in
    the model vocabulary.
    """
    if norm:
        text = normalize_text(text)
    results = phonemizers[lang].phonemize([text])
    phonemes = results[0] if results else ""
    # https://en.wiktionary.org/wiki/kokoro#English
    phonemes = phonemes.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace(
        "kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ"
    )
    # Canonicalize symbols espeak emits that the vocabulary spells differently.
    for src, dst in (("ʲ", "j"), ("r", "ɹ"), ("x", "k"), ("ɬ", "l")):
        phonemes = phonemes.replace(src, dst)
    phonemes = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", phonemes)
    phonemes = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', "z", phonemes)
    if lang == "a":
        phonemes = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", phonemes)
    # Drop anything the model has no token for.
    phonemes = "".join(symbol for symbol in phonemes if symbol in VOCAB)
    return phonemes.strip()


def length_to_mask(lengths):
    """Build a boolean padding mask from a 1-D tensor of sequence lengths.

    Returns a (batch, max_length) tensor where True marks positions at or
    beyond each sequence's length (i.e. padding).
    """
    batch_size = lengths.shape[0]
    positions = (
        torch.arange(lengths.max())
        .unsqueeze(0)
        .expand(batch_size, -1)
        .type_as(lengths)
    )
    # Position i is padding exactly when i + 1 > length.
    return torch.gt(positions + 1, lengths.unsqueeze(1))


@torch.no_grad()
def forward(model, tokens, ref_s, speed):
    """Run Kokoro inference: token ids + reference style -> waveform.

    Args:
        model: dict of the model's sub-modules ("bert", "bert_encoder",
            "predictor", "text_encoder", "decoder").
        tokens: list[int] of phoneme token ids (without padding tokens).
        ref_s: reference style embedding from the voicepack; the first 128
            columns feed the decoder and the rest feed the predictor —
            presumably acoustic vs. prosody style halves (TODO confirm).
        speed: speaking-rate divisor (>1 is faster).

    Returns:
        1-D numpy array with the synthesized audio samples.
    """
    device = ref_s.device
    # Wrap the sequence in start/end padding tokens (id 0) and batch it.
    tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
    text_mask = length_to_mask(input_lengths).to(device)
    bert_dur = model["bert"](tokens, attention_mask=(~text_mask).int())
    d_en = model["bert_encoder"](bert_dur).transpose(-1, -2)
    s = ref_s[:, 128:]
    d = model["predictor"].text_encoder(d_en, s, input_lengths, text_mask)
    x, _ = model["predictor"].lstm(d)
    duration = model["predictor"].duration_proj(x)
    # Per-token frame counts; dividing by speed shortens/stretches the audio.
    duration = torch.sigmoid(duration).sum(axis=-1) / speed
    pred_dur = torch.round(duration).clamp(min=1).long()
    # Build a hard token->frame alignment matrix from the predicted durations.
    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
    c_frame = 0
    for i in range(pred_aln_trg.size(0)):
        pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
        c_frame += pred_dur[0, i].item()
    # Expand encoder features to frame rate via the alignment.
    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
    F0_pred, N_pred = model["predictor"].F0Ntrain(en, s)
    t_en = model["text_encoder"](tokens, input_lengths, text_mask)
    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
    return (
        model["decoder"](asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
    )


def generate(model, text, voicepack, lang="a", speed=1, ps=None):
    """Synthesize speech for *text* with the Kokoro model.

    Args:
        model: dict of Kokoro sub-modules (see ``forward``).
        text: input text; phonemized unless *ps* supplies phonemes directly.
        voicepack: per-length reference style embeddings, indexed by token count.
        lang: "a" for American English, "b" for British English.
        speed: speaking-rate divisor.
        ps: optional pre-computed phoneme string, bypassing phonemization.

    Returns:
        numpy audio array, or None when no tokens could be produced.
    """
    phoneme_str = ps or phonemize(text, lang)
    token_ids = tokenize(phoneme_str)
    if not token_ids:
        return None
    # The model accepts at most 510 tokens (512 with the two pad tokens).
    if len(token_ids) > 510:
        token_ids = token_ids[:510]
        print("Truncated to 510 tokens")
    ref_s = voicepack[len(token_ids)]
    return forward(model, token_ids, ref_s, speed)
Loading
Loading