Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions kokoro.js/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions kokoro.js/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"description": "High-quality text-to-speech for the web",
"dependencies": {
"@huggingface/transformers": "^3.5.1",
"phonemize": "^1.1.0",
"phonemizer": "^1.2.1"
},
"devDependencies": {
Expand Down
15 changes: 9 additions & 6 deletions kokoro.js/src/kokoro.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ export class KokoroTTS {
* Create a new KokoroTTS instance.
* @param {import('@huggingface/transformers').StyleTextToSpeech2Model} model The model
* @param {import('@huggingface/transformers').PreTrainedTokenizer} tokenizer The tokenizer
* @param {"1.0"|"1.1"} version The version of the Chinese phonemization style
*/
constructor(model, tokenizer) {
constructor(model, tokenizer, version = "1.0") {
// Keep the loaded model and tokenizer on the instance for its methods to use.
this.model = model;
this.tokenizer = tokenizer;
// Chinese phonemization style ("1.0" or "1.1"); handed to phonemize() as its third argument.
this.version = version;
}

/**
Expand All @@ -42,8 +44,9 @@ export class KokoroTTS {
const model = StyleTextToSpeech2Model.from_pretrained(model_id, { progress_callback, dtype, device });
const tokenizer = AutoTokenizer.from_pretrained(model_id, { progress_callback });

const info = await Promise.all([model, tokenizer]);
return new KokoroTTS(...info);
const version = model_id.includes("v1.1-zh") ? "1.1" : "1.0";

return new KokoroTTS(await model, await tokenizer, version);
}

get voices() {
Expand All @@ -60,7 +63,7 @@ export class KokoroTTS {
console.table(VOICES);
throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
}
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
const language = /** @type {"a"|"b"|"z"} */ (voice.at(0)); // "a" or "b" or "z"
return language;
}

Expand All @@ -74,7 +77,7 @@ export class KokoroTTS {
async generate(text, { voice = "af_heart", speed = 1 } = {}) {
const language = this._validate_voice(voice);

const phonemes = await phonemize(text, language);
const phonemes = await phonemize(text, language, this.version);
const { input_ids } = this.tokenizer(phonemes, {
truncation: true,
});
Expand Down Expand Up @@ -135,7 +138,7 @@ export class KokoroTTS {
throw new Error("Invalid input type. Expected string or TextSplitterStream.");
}
for await (const sentence of splitter) {
const phonemes = await phonemize(sentence, language);
const phonemes = await phonemize(sentence, language, this.version);
const { input_ids } = this.tokenizer(phonemes, {
truncation: true,
});
Expand Down
26 changes: 21 additions & 5 deletions kokoro.js/src/phonemize.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { phonemize as espeakng } from "phonemizer";
import { toZhuyin, toIPA } from "phonemize/zh";

/**
* Helper function to split a string on a regex, but keep the delimiters.
Expand Down Expand Up @@ -164,25 +165,40 @@ function escapeRegExp(string) {
// Characters treated as punctuation when splitting text into sections.
const PUNCTUATION = ';:,.!?¡¿—…"«»“”(){}[]';
// One or more punctuation runs, together with any whitespace surrounding them.
const PUNCTUATION_PATTERN = new RegExp(String.raw`(\s*[${escapeRegExp(PUNCTUATION)}]+\s*)+`, "g");

// A run of characters in the U+4E00–U+9FFF range. Not global, so `.test()` keeps no `lastIndex` state between calls.
const ZH_PATTERN = /[\u4e00-\u9fff]+/;
// Split delimiter used for "z" voices: either a punctuation run or a run of Chinese characters.
const MIXED_PATTERN = new RegExp(String.raw`(\s*[${escapeRegExp(PUNCTUATION)}]+\s*|${ZH_PATTERN.source})`, "g");

/**
* Phonemize text: eSpeak-NG handles ordinary text, and for "z" voices runs of Chinese characters are converted with phonemize/zh
* @param {string} text The text to phonemize
* @param {"a"|"b"} language The language to use
* @param {"a"|"b"|"z"} language The language to use
* @param {"1.0"|"1.1"} version The version of the Chinese phonemization style
* @param {boolean} norm Whether to normalize the text
* @returns {Promise<string>} The phonemized text
*/
export async function phonemize(text, language = "a", norm = true) {
export async function phonemize(text, language = "a", version = "1.0", norm = true) {
// 1. Normalize text
if (norm) {
text = normalize_text(text);
}

// 2. Split into chunks, to ensure we preserve punctuation
const sections = split(text, PUNCTUATION_PATTERN);
// 2. Split into chunks, to ensure we preserve punctuation and separate Chinese text if needed
const sections = split(text, language === "z" ? MIXED_PATTERN : PUNCTUATION_PATTERN);

// 3. Convert each section to phonemes
const lang = language === "a" ? "en-us" : "en";
const ps = (await Promise.all(sections.map(async ({ match, text }) => (match ? text : (await espeakng(text, lang)).join(" "))))).join("");
const processedSections = await Promise.all(sections.map(async ({ match, text }) => {
if (!match) {
return (await espeakng(text, lang)).join(" ");
}
if (ZH_PATTERN.test(text)) {
return version === "1.0" ? toIPA(text, { toneFormat: "arrow" }) : toZhuyin(text);
} else {
return text;
}
}));

const ps = processedSections.join("");

// 4. Post-process phonemes
let processed = ps
Expand Down
128 changes: 64 additions & 64 deletions kokoro.js/src/voices.js
Original file line number Diff line number Diff line change
Expand Up @@ -248,70 +248,70 @@ export const VOICES = Object.freeze({
// targetQuality: "B",
// overallGrade: "C-",
// },
// zf_xiaobei: {
// name: "xiaobei",
// language: "zh",
// gender: "Female",
// traits: "🚺",
// targetQuality: "C",
// overallGrade: "D",
// },
// zf_xiaoni: {
// name: "xiaoni",
// language: "zh",
// gender: "Female",
// traits: "🚺",
// targetQuality: "C",
// overallGrade: "D",
// },
// zf_xiaoxiao: {
// name: "xiaoxiao",
// language: "zh",
// gender: "Female",
// traits: "🚺",
// targetQuality: "C",
// overallGrade: "D",
// },
// zf_xiaoyi: {
// name: "xiaoyi",
// language: "zh",
// gender: "Female",
// traits: "🚺",
// targetQuality: "C",
// overallGrade: "D",
// },
// zm_yunjian: {
// name: "yunjian",
// language: "zh",
// gender: "Male",
// traits: "🚹",
// targetQuality: "C",
// overallGrade: "D",
// },
// zm_yunxi: {
// name: "yunxi",
// language: "zh",
// gender: "Male",
// traits: "🚹",
// targetQuality: "C",
// overallGrade: "D",
// },
// zm_yunxia: {
// name: "yunxia",
// language: "zh",
// gender: "Male",
// traits: "🚹",
// targetQuality: "C",
// overallGrade: "D",
// },
// zm_yunyang: {
// name: "yunyang",
// language: "zh",
// gender: "Male",
// traits: "🚹",
// targetQuality: "C",
// overallGrade: "D",
// },
zf_xiaobei: {
name: "xiaobei",
language: "zh",
gender: "Female",
traits: "🚺",
targetQuality: "C",
overallGrade: "D",
},
zf_xiaoni: {
name: "xiaoni",
language: "zh",
gender: "Female",
traits: "🚺",
targetQuality: "C",
overallGrade: "D",
},
zf_xiaoxiao: {
name: "xiaoxiao",
language: "zh",
gender: "Female",
traits: "🚺",
targetQuality: "C",
overallGrade: "D",
},
zf_xiaoyi: {
name: "xiaoyi",
language: "zh",
gender: "Female",
traits: "🚺",
targetQuality: "C",
overallGrade: "D",
},
zm_yunjian: {
name: "yunjian",
language: "zh",
gender: "Male",
traits: "🚹",
targetQuality: "C",
overallGrade: "D",
},
zm_yunxi: {
name: "yunxi",
language: "zh",
gender: "Male",
traits: "🚹",
targetQuality: "C",
overallGrade: "D",
},
zm_yunxia: {
name: "yunxia",
language: "zh",
gender: "Male",
traits: "🚹",
targetQuality: "C",
overallGrade: "D",
},
zm_yunyang: {
name: "yunyang",
language: "zh",
gender: "Male",
traits: "🚹",
targetQuality: "C",
overallGrade: "D",
},
// ef_dora: {
// name: "dora",
// language: "es",
Expand Down
22 changes: 22 additions & 0 deletions kokoro.js/tests/phonemize.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ const B_TEST_CASES = new Map([
["X's mark", "ˈɛksɪz mˈɑːk"],
]);

// Mixed Chinese/English input mapped to the phonemes expected from phonemize(input, "z", "1.0").
const Z_TEST_CASES = new Map(
  Object.entries({
    "中文, test": "ʈʂʊŋ→ wən↗, tˈɛst",
  }),
);

// Mixed Chinese/English input mapped to the phonemes expected from phonemize(input, "z", "1.1").
const Z_V2_TEST_CASES = new Map(
  Object.entries({
    "中文, test": "ㄓㄨㄥ1 ㄨㄣ2, tˈɛst",
  }),
);

describe("phonemize", () => {
describe("en-us", () => {
for (const [input, expected] of A_TEST_CASES) {
Expand All @@ -92,4 +100,18 @@ describe("phonemize", () => {
});
}
});
// Chinese voice ("z") cases run with the "1.0" phonemization style.
describe("zh v1", () => {
  Z_TEST_CASES.forEach((expected, input) => {
    test(`phonemize("${input}")`, async () => {
      const actual = await phonemize(input, "z", "1.0");
      expect(actual).toEqual(expected);
    });
  });
});
// Chinese voice ("z") cases run with the "1.1" phonemization style.
describe("zh v2", () => {
  Z_V2_TEST_CASES.forEach((expected, input) => {
    test(`phonemize("${input}")`, async () => {
      const actual = await phonemize(input, "z", "1.1");
      expect(actual).toEqual(expected);
    });
  });
});
});