Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions demo/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
git \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

RUN apt-get install espeak-ng -y

RUN useradd -m -u 1000 user

USER user
Expand All @@ -18,6 +20,7 @@ ENV HOME=/home/user \
WORKDIR $HOME/app

RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
RUN pip3 install phonemizer
RUN pip3 install document-to-podcast

COPY --chown=user . $HOME/app
Expand Down
17 changes: 13 additions & 4 deletions demo/app.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Streamlit app for converting documents to podcasts."""

import io
import os
import re
from pathlib import Path
import io

import numpy as np
import soundfile as sf
Expand All @@ -28,7 +29,10 @@ def load_text_to_text_model():

@st.cache_resource
def load_text_to_speech_model():
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
if os.environ.get("HF_SPACE") == "TRUE":
return load_tts_model("hexgrad/Kokoro-82M/kokoro-v0_19.pth")
else:
return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")


def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
Expand Down Expand Up @@ -115,10 +119,15 @@ def gen_button_clicked():
text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

if os.environ.get("HF_SPACE") == "TRUE":
tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
else:
tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"

st.markdown(
"For this demo, we are using the following models: \n"
"- [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n"
"- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
f"{tts_link}\n"
)
st.markdown(
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
Expand Down Expand Up @@ -187,7 +196,7 @@ def gen_button_clicked():

if st.session_state[gen_button]:
audio_np = stack_audio_segments(
st.session_state.audio, speech_model.sample_rate
st.session_state.audio, speech_model.sample_rate, silence_pad=0.0
)
audio_wav = numpy_to_wav(audio_np, speech_model.sample_rate)
if st.download_button(
Expand Down
26 changes: 18 additions & 8 deletions demo/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,18 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl\n",
"%pip install --quiet document-to-podcast"
"%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl\n",
"%pip install --quiet git+https://github.com/mozilla-ai/document-to-podcast.git@text-to-speech-model\n",
"%pip install --quiet phonemizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!apt-get -qq -y install espeak-ng"
]
},
{
Expand Down Expand Up @@ -173,7 +183,7 @@
"source": [
"For this demo, we are using the following models:\n",
" - [Qwen2.5-3B-Instruct](https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF)\n",
" - [OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
" - [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
]
},
{
Expand All @@ -197,7 +207,7 @@
"text_model = load_llama_cpp_model(\n",
" \"bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf\"\n",
")\n",
"speech_model = load_tts_model(\"OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf\")"
"speech_model = load_tts_model(\"hexgrad/Kokoro-82M/kokoro-v0_19.pth\")"
]
},
{
Expand Down Expand Up @@ -247,15 +257,15 @@
"speakers = [\n",
" {\n",
" \"id\": 1,\n",
" \"name\": \"Laura\",\n",
" \"name\": \"Sarah\",\n",
" \"description\": \"The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.\",\n",
" \"voice_profile\": \"female_1\",\n",
" \"voice_profile\": \"af_sarah\",\n",
" },\n",
" {\n",
" \"id\": 2,\n",
" \"name\": \"Jon\",\n",
" \"name\": \"Michael\",\n",
" \"description\": \"The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.\",\n",
" \"voice_profile\": \"male_1\",\n",
" \"voice_profile\": \"am_michael\",\n",
" },\n",
"]\n",
"\n",
Expand Down
1 change: 0 additions & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ These docs are your companion to mastering the **Document-to-Podcast Blueprint**
### Built with
- Python 3.10+
- [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e. script generation)
- [OuteAI](https://github.com/edwko/OuteTTS) (text-to-speech, i.e audio generation)
- [Streamlit](https://streamlit.io/) (UI demo)


Expand Down
6 changes: 3 additions & 3 deletions src/document_to_podcast/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def document_to_podcast(
- {output_folder}/podcast.txt
- {output_folder}/podcast.wav

text_to_text_model (str, optional): The path to the text-to-text model.
text_to_text_model (str, optional): The text-to-text model_id.

Needs to be formatted as `owner/repo/file`.

Expand All @@ -63,8 +63,8 @@ def document_to_podcast(
text_to_text_prompt (str, optional): The prompt for the text-to-text model.
Defaults to DEFAULT_PROMPT.

text_to_speech_model (str, optional): The path to the text-to-speech model.
Defaults to `OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf`.
text_to_speech_model (str, optional): The text-to-speech model_id.
Defaults to `OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf`.

speakers (list[Speaker] | None, optional): The speakers for the podcast.
Defaults to DEFAULT_SPEAKERS.
Expand Down
Empty file.
184 changes: 184 additions & 0 deletions src/document_to_podcast/inference/kokoro/infer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import phonemizer
import re
import torch


def split_num(num):
    """Spell out a matched time (``h:mm``) or four-digit year for TTS.

    Intended as a ``re.sub`` replacement callback; receives a Match object.
    Decimal matches are returned untouched (handled later by ``point_num``).
    """
    text = num.group()
    if "." in text:
        # Leave decimals for point_num, which runs in a later substitution.
        return text
    if ":" in text:
        hour, minute = (int(part) for part in text.split(":"))
        if minute == 0:
            return f"{hour} o'clock"
        if minute < 10:
            return f"{hour} oh {minute}"
        return f"{hour} {minute}"
    # Four-digit year (optionally plural, e.g. "1990s").
    year = int(text[:4])
    if year < 1100 or year % 1000 < 10:
        # Years like 1066 or 2005 read naturally as plain numbers.
        return text
    century, remainder = text[:2], int(text[2:4])
    plural = "s" if text.endswith("s") else ""
    if 100 <= year % 1000 <= 999:
        if remainder == 0:
            return f"{century} hundred{plural}"
        if remainder < 10:
            return f"{century} oh {remainder}{plural}"
    return f"{century} {remainder}{plural}"


def flip_money(m):
    """Spell out a matched currency amount ("$2.50" -> "2 dollars and 50 cents").

    Intended as a ``re.sub`` replacement callback; receives a Match object
    whose first character is the currency symbol ($ or £).
    """
    text = m.group()
    unit = "dollar" if text[0] == "$" else "pound"
    if text[-1].isalpha():
        # Amount ends in a magnitude word, e.g. "$3 million" -> "3 million dollars".
        return f"{text[1:]} {unit}s"
    if "." not in text:
        plural = "" if text[1:] == "1" else "s"
        return f"{text[1:]} {unit}{plural}"
    whole, frac = text[1:].split(".")
    plural = "" if whole == "1" else "s"
    # Pad a single fractional digit: "$2.5" means 50 cents, not 5.
    cents = int(frac.ljust(2, "0"))
    if text[0] == "$":
        coins = "cent" if cents == 1 else "cents"
    else:
        coins = "penny" if cents == 1 else "pence"
    return f"{whole} {unit}{plural} and {cents} {coins}"


def point_num(num):
    """Read a matched decimal aloud, digit by digit: "3.14" -> "3 point 1 4".

    Intended as a ``re.sub`` replacement callback; receives a Match object.
    """
    whole, frac = num.group().split(".")
    return f"{whole} point {' '.join(frac)}"


def normalize_text(text):
    """Normalize raw text for phonemization.

    Applies, in order: quote/bracket canonicalization, CJK punctuation
    mapping, whitespace cleanup, common-abbreviation expansion, and
    number/currency/decimal verbalization via the ``split_num``,
    ``flip_money`` and ``point_num`` callbacks. The substitution order is
    significant — e.g. currency must be handled before bare decimals.
    """
    # Curly single quotes -> ASCII apostrophe.
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
    # Guillemets -> curly double quotes, then all curly doubles -> ASCII ".
    text = text.replace("«", chr(8220)).replace("»", chr(8221))
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
    # Parentheses are re-encoded as guillemets (now free after the step above).
    text = text.replace("(", "«").replace(")", "»")
    # CJK punctuation -> ASCII equivalent plus a space.
    for a, b in zip("、。!,:;?", ",.!,:;?"):
        text = text.replace(a, b + " ")
    # Collapse exotic whitespace (keep spaces and newlines), then runs of spaces.
    text = re.sub(r"[^\S \n]", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"(?<=\n) +(?=\n)", "", text)
    # Expand titles/abbreviations so espeak reads them as words.
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
    # "yeah"/"yea" -> "ye'a" pronunciation hint (case preserved on the y).
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
    # Verbalize decimals, 4-digit years, and h:mm times.
    text = re.sub(
        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
    )
    # Strip thousands separators: 1,000 -> 1000.
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    # Verbalize currency amounts (must run before the bare-decimal rule below).
    text = re.sub(
        r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
        flip_money,
        text,
    )
    # Remaining decimals are read digit by digit.
    text = re.sub(r"\d*\.\d+", point_num, text)
    # Numeric ranges: "5-10" -> "5 to 10".
    text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
    text = re.sub(r"(?<=\d)S", " S", text)
    # Possessive/plural 's after consonant initialisms -> 'S, e.g. "TV's".
    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
    text = re.sub(r"(?<=X')S\b", "s", text)
    # Dotted initialisms followed by a lowercase word: "U.S. a" -> "U-S- a".
    text = re.sub(
        r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
    )
    text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
    return text.strip()


def get_vocab():
    """Build the symbol -> token-id vocabulary used by the Kokoro model.

    The order is significant: pad symbol first (id 0), then punctuation,
    ASCII letters, and IPA phoneme characters. Returns a dict mapping each
    single-character symbol to its integer token id.
    """
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»“” '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
    # Dict comprehension replaces the original index loop; for duplicate
    # symbols the later index wins, matching the original behavior.
    return {symbol: index for index, symbol in enumerate(symbols)}


VOCAB = get_vocab()


def tokenize(ps):
    """Map a phoneme string to token ids, silently dropping unknown symbols."""
    token_ids = []
    for symbol in ps:
        token_id = VOCAB.get(symbol)
        if token_id is not None:
            token_ids.append(token_id)
    return token_ids


# espeak-ng backends keyed by language code: "a" = American English,
# "b" = British English. Instantiated once at import time; requires the
# espeak-ng system package to be installed.
phonemizers = dict(
    a=phonemizer.backend.EspeakBackend(
        language="en-us", preserve_punctuation=True, with_stress=True
    ),
    b=phonemizer.backend.EspeakBackend(
        language="en-gb", preserve_punctuation=True, with_stress=True
    ),
)


def phonemize(text, lang, norm=True):
    """Convert text to a phoneme string for language "a" (en-us) or "b" (en-gb).

    Optionally normalizes the text first, runs the espeak backend, applies
    Kokoro-specific pronunciation fixups, and keeps only symbols present in
    the model vocabulary.
    """
    if norm:
        text = normalize_text(text)
    results = phonemizers[lang].phonemize([text])
    phonemes = results[0] if results else ""
    # https://en.wiktionary.org/wiki/kokoro#English
    phonemes = phonemes.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace(
        "kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ"
    )
    # Canonicalize symbols espeak emits that the vocabulary spells differently.
    for src, dst in (("ʲ", "j"), ("r", "ɹ"), ("x", "k"), ("ɬ", "l")):
        phonemes = phonemes.replace(src, dst)
    phonemes = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", phonemes)
    phonemes = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', "z", phonemes)
    if lang == "a":
        phonemes = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", phonemes)
    # Drop anything the model has no token for.
    phonemes = "".join(symbol for symbol in phonemes if symbol in VOCAB)
    return phonemes.strip()


def length_to_mask(lengths):
    """Build a boolean padding mask from a 1-D tensor of sequence lengths.

    Returns a (batch, max_length) tensor where True marks positions at or
    beyond each sequence's length (i.e. padding).
    """
    batch_size = lengths.shape[0]
    positions = (
        torch.arange(lengths.max())
        .unsqueeze(0)
        .expand(batch_size, -1)
        .type_as(lengths)
    )
    # Position i is padding exactly when i + 1 > length.
    return torch.gt(positions + 1, lengths.unsqueeze(1))


@torch.no_grad()
def forward(model, tokens, ref_s, speed):
    """Run Kokoro inference: token ids + reference style -> waveform.

    Args:
        model: dict of the model's sub-modules ("bert", "bert_encoder",
            "predictor", "text_encoder", "decoder").
        tokens: list[int] of phoneme token ids (without padding tokens).
        ref_s: reference style embedding from the voicepack; the first 128
            columns feed the decoder and the rest feed the predictor —
            presumably acoustic vs. prosody style halves (TODO confirm).
        speed: speaking-rate divisor (>1 is faster).

    Returns:
        1-D numpy array with the synthesized audio samples.
    """
    device = ref_s.device
    # Wrap the sequence in start/end padding tokens (id 0) and batch it.
    tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
    text_mask = length_to_mask(input_lengths).to(device)
    bert_dur = model["bert"](tokens, attention_mask=(~text_mask).int())
    d_en = model["bert_encoder"](bert_dur).transpose(-1, -2)
    s = ref_s[:, 128:]
    d = model["predictor"].text_encoder(d_en, s, input_lengths, text_mask)
    x, _ = model["predictor"].lstm(d)
    duration = model["predictor"].duration_proj(x)
    # Per-token frame counts; dividing by speed shortens/stretches the audio.
    duration = torch.sigmoid(duration).sum(axis=-1) / speed
    pred_dur = torch.round(duration).clamp(min=1).long()
    # Build a hard token->frame alignment matrix from the predicted durations.
    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
    c_frame = 0
    for i in range(pred_aln_trg.size(0)):
        pred_aln_trg[i, c_frame : c_frame + pred_dur[0, i].item()] = 1
        c_frame += pred_dur[0, i].item()
    # Expand encoder features to frame rate via the alignment.
    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
    F0_pred, N_pred = model["predictor"].F0Ntrain(en, s)
    t_en = model["text_encoder"](tokens, input_lengths, text_mask)
    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
    return (
        model["decoder"](asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
    )


def generate(model, text, voicepack, lang="a", speed=1, ps=None):
    """Synthesize speech for *text* with the Kokoro model.

    Args:
        model: dict of Kokoro sub-modules (see ``forward``).
        text: input text; phonemized unless *ps* supplies phonemes directly.
        voicepack: per-length reference style embeddings, indexed by token count.
        lang: "a" for American English, "b" for British English.
        speed: speaking-rate divisor.
        ps: optional pre-computed phoneme string, bypassing phonemization.

    Returns:
        numpy audio array, or None when no tokens could be produced.
    """
    phoneme_str = ps or phonemize(text, lang)
    token_ids = tokenize(phoneme_str)
    if not token_ids:
        return None
    # The model accepts at most 510 tokens (512 with the two pad tokens).
    if len(token_ids) > 510:
        token_ids = token_ids[:510]
        print("Truncated to 510 tokens")
    ref_s = voicepack[len(token_ids)]
    return forward(model, token_ids, ref_s, speed)
Loading
Loading