Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,371 changes: 28 additions & 1,343 deletions kokoro.js/package-lock.json

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions kokoro.js/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@
"license": "Apache-2.0",
"description": "High-quality text-to-speech for the web",
"dependencies": {
"@huggingface/transformers": "^3.5.1",
"phonemizer": "^1.2.1"
"@huggingface/transformers": "^3.5.1"
},
"devDependencies": {
"@rollup/plugin-node-resolve": "^16.0.0",
Expand Down
4,347 changes: 4,347 additions & 0 deletions kokoro.js/src/espeakng/espeakng.worker.js

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions kokoro.js/src/espeakng/phonemizer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import Module from "./espeakng.worker.js";

// Resolves with a new `eSpeakNGWorker` instance. If the module has already run
// (`Module.calledRun`), the worker is created immediately; otherwise creation is
// deferred to the module's `onRuntimeInitialized` callback.
const workerPromise = new Promise((resolve) => {
  const createWorker = () => resolve(new Module.eSpeakNGWorker());

  if (!Module.calledRun) {
    Module.onRuntimeInitialized = createWorker;
    return;
  }
  createWorker();
});

// Base language codes (the part before the first "-") that voices are kept for.
const SUPPORTED_LANGUAGES = [
  "en", // English
  "it", // Italian
];

// Resolves to `{ voices, identifiers }`:
//  - `voices`: the worker's voices, each with its `languages` narrowed to the supported
//    ones; voices left without any supported language are dropped.
//  - `identifiers`: a Set holding every kept voice identifier and language name.
const initCache = workerPromise.then((worker) => {
  /** @param {{name: string; priority: number}} lang */
  const isSupported = (lang) => SUPPORTED_LANGUAGES.includes(lang.name.split("-")[0]);

  const voices = [];
  const identifiers = new Set();

  for (const { name, identifier, languages } of worker.list_voices()) {
    const supported = languages.filter(isSupported);
    if (supported.length === 0) continue;

    voices.push({ name, identifier, languages: supported });

    // A voice can be addressed by its own identifier or by any of its language names.
    identifiers.add(identifier);
    for (const { name: languageName } of supported) {
      identifiers.add(languageName);
    }
  }

  return { voices, identifiers };
});

/**
 * List the available voices, optionally restricted to one language.
 *
 * Only the base of `language` (the part before the first "-") is used for matching:
 * a voice is returned when one of its language names equals that base or starts
 * with the base followed by "-".
 *
 * @param {string} [language] The language identifier; when omitted or empty, all cached voices are returned
 * @returns {Promise<{name: string; identifier: string; languages: {name: string; priority: number}[]}[]>} A list of available voices
 */
export const list_voices = async (language) => {
  const { voices } = await initCache;
  if (!language) return voices;

  const [base] = language.split("-");
  const prefix = `${base}-`;

  /** @param {{name: string; priority: number}} lang */
  const matchesBase = ({ name }) => name === base || name.startsWith(prefix);

  return voices.filter(({ languages }) => languages.some(matchesBase));
};

/**
 * Multilingual text to phonemes converter
 *
 * Selects the requested voice on the shared worker, synthesizes IPA for `text`, and
 * splits the result on newlines, dropping empty lines.
 *
 * @param {string} text The input text
 * @param {string} [language="en-us"] The language identifier; must be contained in the cached `identifiers` set
 * @returns {Promise<string[]>} A phonemized version of the input (empty when the worker result has no `ipa` value)
 * @throws {Error} If `language` is not a known identifier
 */
export const phonemize = async (text, language = "en-us") => {
  const worker = await workerPromise;

  const { identifiers } = await initCache;
  if (!identifiers.has(language)) {
    // Copy-and-sort rather than `toSorted()`: the error path must not itself fail on
    // runtimes that lack the ES2023 array methods, which would hide this message.
    const supported = [...identifiers].sort().join(", ");
    throw new Error(`Invalid language identifier: "${language}". Should be one of: ${supported}.`);
  }
  worker.set_voice(language);

  const { ipa } = worker.synthesize_ipa(text);
  return (
    ipa?.split("\n").filter(
      /** @param {string} line */
      (line) => line.length > 0,
    ) ?? []
  );
};
17 changes: 11 additions & 6 deletions kokoro.js/src/kokoro.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,18 @@ export class KokoroTTS {
console.table(VOICES);
}

/**
* Validate input voice against supported voices
*
* On failure, logs the problem with `console.error`, prints the `VOICES` table,
* and then throws.
*
* @param {string} voice The voice to validate
* @returns {"a"|"b"|"i"} The first character of `voice`, returned as the language
* @throws {Error} If `voice` is not an own property of `VOICES`
*/
_validate_voice(voice) {
if (!VOICES.hasOwnProperty(voice)) {
console.error(`Voice "${voice}" not found. Available voices:`);
console.table(VOICES);
throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
}
// NOTE(review): the two `language` declarations below look like the removed and added
// sides of a rendered diff; only one can exist in the real file — confirm against the repository.
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
const language = /** @type {"a"|"b"|"i"} */ (voice.at(0));
return language;
}

Expand Down Expand Up @@ -126,9 +131,9 @@ export class KokoroTTS {
splitter = new TextSplitterStream();
const chunks = split_pattern
? text
.split(split_pattern)
.map((chunk) => chunk.trim())
.filter((chunk) => chunk.length > 0)
.split(split_pattern)
.map((chunk) => chunk.trim())
.filter((chunk) => chunk.length > 0)
: [text];
splitter.push(...chunks);
} else {
Expand All @@ -151,10 +156,10 @@ export class KokoroTTS {

export const env = {
set cacheDir(value) {
hf.cacheDir = value
hf.cacheDir = value;
},
get cacheDir() {
return hf.cacheDir
return hf.cacheDir;
},
set wasmPaths(value) {
hf.backends.onnx.wasm.wasmPaths = value;
Expand Down
22 changes: 19 additions & 3 deletions kokoro.js/src/phonemize.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { phonemize as espeakng } from "phonemizer";
import { phonemize as espeakng } from "./espeakng/phonemizer.js";

/**
* Helper function to split a string on a regex, but keep the delimiters.
Expand Down Expand Up @@ -167,7 +167,7 @@ const PUNCTUATION_PATTERN = new RegExp(`(\\s*[${escapeRegExp(PUNCTUATION)}]+\\s*
/**
* Phonemize text using the eSpeak-NG phonemizer
* @param {string} text The text to phonemize
* @param {"a"|"b"} language The language to use
* @param {"a"|"b"|"i"} language The language to use
* @param {boolean} norm Whether to normalize the text
* @returns {Promise<string>} The phonemized text
*/
Expand All @@ -181,7 +181,7 @@ export async function phonemize(text, language = "a", norm = true) {
const sections = split(text, PUNCTUATION_PATTERN);

// 3. Convert each section to phonemes
const lang = language === "a" ? "en-us" : "en";
const lang = getLocale(language);
const ps = (await Promise.all(sections.map(async ({ match, text }) => (match ? text : (await espeakng(text, lang)).join(" "))))).join("");

// 4. Post-process phonemes
Expand All @@ -202,3 +202,19 @@ export async function phonemize(text, language = "a", norm = true) {
}
return processed.trim();
}

/**
 * Get the locale for a particular language prefix
 *
 * @param {"a"|"b"|"i"} language The language prefix for which to get the locale
 * @returns {"en-us"|"en"|"it"} The locale identifier handed to the eSpeak-NG phonemizer
 * @throws {Error} If `language` is not one of the supported prefixes
 */
function getLocale(language) {
  switch (language) {
    case "a":
      return "en-us";
    case "b":
      return "en";
    case "i":
      return "it";
    default:
      // Fail loudly: an `undefined` locale would silently select the phonemizer's
      // default language and produce phonemes for the wrong language.
      throw new Error(`Unsupported language prefix: "${String(language)}". Should be one of: a, b, i.`);
  }
}
46 changes: 22 additions & 24 deletions kokoro.js/src/voices.js
Original file line number Diff line number Diff line change
Expand Up @@ -376,22 +376,22 @@ export const VOICES = Object.freeze({
// targetQuality: "B",
// overallGrade: "C",
// },
// if_sara: {
// name: "sara",
// language: "it",
// gender: "Female",
// traits: "🚺",
// targetQuality: "B",
// overallGrade: "C",
// },
// im_nicola: {
// name: "nicola",
// language: "it",
// gender: "Male",
// traits: "🚹",
// targetQuality: "B",
// overallGrade: "C",
// },
if_sara: {
name: "sara",
language: "it",
gender: "Female",
traits: "🚺",
targetQuality: "B",
overallGrade: "C",
},
im_nicola: {
name: "nicola",
language: "it",
gender: "Male",
traits: "🚹",
targetQuality: "B",
overallGrade: "C",
},
// pf_dora: {
// name: "dora",
// language: "pt-br",
Expand All @@ -418,43 +418,41 @@ export const VOICES = Object.freeze({
// },
});


/**
* The base URL for fetching voice data files.
* Declared with `let` because `setVoiceDataUrl` reassigns it at runtime.
*/
let voiceDataUrl = "https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main/voices";


/**
* Retrieves the current voice data URL.
*
* @returns {string} The value currently held in the module-level `voiceDataUrl` variable.
*/
export function getVoiceDataUrl() {
return voiceDataUrl;
};
}

/**
* Sets a new voice data URL.
*
* The value is stored exactly as given; trimming is used only to reject
* whitespace-only input.
*
* @param {string} url - The new URL to set for voice data.
* @throws {Error} If `url` is not a string, or is empty after trimming.
*/
export function setVoiceDataUrl(url) {
if (typeof url === 'string' && url.trim() !== '') {
if (typeof url === "string" && url.trim() !== "") {
voiceDataUrl = url;
} else {
throw new Error("Invalid URL");
}
};
}

/**
*
* @param {keyof typeof VOICES} id
* @returns {Promise<ArrayBufferLike>}
*/
async function getVoiceFile(id) {
if (fs && Object.hasOwn(fs, 'readFile')) {
if (fs && Object.hasOwn(fs, "readFile")) {
const dirname = typeof __dirname !== "undefined" ? __dirname : import.meta.dirname;
const file = path.resolve(dirname, `../voices/${id}.bin`);
const { buffer } = await fs.readFile(file);
Expand Down
15 changes: 12 additions & 3 deletions kokoro.js/tests/phonemize.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ const A_TEST_CASES = new Map([
["Yeah", "jˈɛə"],
["yeah", "jˈɛə"],
["1990", "nˈaɪntiːn nˈaɪndi"],
["12:34", "twˈɛlv θˈɜːɾi fˈoːɹ"],
["12:34", "twˈɛlv θˈɜːɾi fˈɔːɹ"],
["2022s", "twˈɛnti twˈɛnti tˈuːz"],
["1,000", "wˈʌn θˈaʊzənd"],
["12,345,678", "twˈɛlv mˈɪliən θɹˈiː hˈʌndɹɪd fˈoːɹɾi fˈaɪv θˈaʊzənd sˈɪks hˈʌndɹɪd sˈɛvənti ˈeɪt"],
["12,345,678", "twˈɛlv mˈɪliən θɹˈiː hˈʌndɹɪd fˈɔːɹɾi fˈaɪv θˈaʊzənd sˈɪks hˈʌndɹɪd sˈɛvənti ˈeɪt"],
["$100", "wˈʌn hˈʌndɹɪd dˈɑːlɚz"],
["£1.50", "wˈʌn pˈaʊnd ænd fˈɪfti pˈɛns"],
["12.34", "twˈɛlv pˈɔɪnt θɹˈiː fˈoːɹ"],
["12.34", "twˈɛlv pˈɔɪnt θɹˈiː fˈɔːɹ"],
["0.01", "zˈiəɹoʊ pˈɔɪnt zˈiəɹoʊ wˈʌn"],
["10-20", "tˈɛn tə twˈɛnti"],
["5-10", "fˈaɪv tə tˈɛn"],
Expand Down Expand Up @@ -77,6 +77,8 @@ const B_TEST_CASES = new Map([
["X's mark", "ˈɛksɪz mˈɑːk"],
]);

// Italian fixtures: each entry maps an input sentence to the phoneme string that
// `phonemize(input, "i")` is expected to produce.
const I_TEST_CASES = new Map([
  [
    "Ma la volpe col suo balzo ha raggiunto il quieto Fido",
    "ma la vˈolpe kol sˌʊo bˈaltso a ɹadʒːˈunto il kwjˈɛto fˈido",
  ],
]);

describe("phonemize", () => {
describe("en-us", () => {
for (const [input, expected] of A_TEST_CASES) {
Expand All @@ -92,4 +94,11 @@ describe("phonemize", () => {
});
}
});
describe("it", () => {
for (const [input, expected] of I_TEST_CASES) {
test(`phonemize("${input}")`, async () => {
expect(await phonemize(input, "i")).toEqual(expected);
});
}
});
});