diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts index bf0bc03c1..db505f552 100644 --- a/packages/cli/src/commands/tts.ts +++ b/packages/cli/src/commands/tts.ts @@ -8,7 +8,15 @@ export const examples: Example[] = [ ["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'], ["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'], [ - "Generate Spanish speech", + "Use the Supertonic engine", + 'hyperframes tts "Lightning-fast on-device speech" --engine supertonic --voice F1', + ], + [ + "Supertonic in another language", + 'hyperframes tts "안녕하세요" --engine supertonic --voice F2 --lang ko', + ], + [ + "Generate Spanish speech (Kokoro)", 'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav', ], [ @@ -17,27 +25,28 @@ export const examples: Example[] = [ ], ["Read text from a file", "hyperframes tts script.txt"], ["List available voices", "hyperframes tts --list"], + ["List Supertonic voices", "hyperframes tts --list --engine supertonic"], ]; import { resolve, extname } from "node:path"; import * as clack from "@clack/prompts"; import { c } from "../ui/colors.js"; import { errorBox } from "../ui/format.js"; import { - DEFAULT_VOICE, - BUNDLED_VOICES, - SUPPORTED_LANGS, - inferLangFromVoiceId, - isSupportedLang, - type SupportedLang, -} from "../tts/manager.js"; + DEFAULT_ENGINE, + ENGINE_IDS, + getEngine, + isEngineId, + type EngineId, + type TtsEngine, +} from "../tts/engine.js"; -const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", "); -const langList = SUPPORTED_LANGS.join(", "); +const engineList = ENGINE_IDS.join(", "); export default defineCommand({ meta: { name: "tts", - description: "Generate speech audio from text using a local AI model (Kokoro-82M)", + description: + "Generate speech audio from text using a local AI model (Kokoro-82M or Supertonic 3)", }, args: { input: { @@ -45,6 +54,11 @@ export default defineCommand({ description: "Text to speak, or path to a .txt file", required: false, }, + engine: { + type: "string", + description: `TTS engine (default: ${DEFAULT_ENGINE}). Options: ${engineList}`, + alias: "e", + }, output: { type: "string", description: "Output file path (default: speech.wav in current directory)", @@ -52,7 +66,7 @@ export default defineCommand({ }, voice: { type: "string", - description: `Voice ID (default: ${DEFAULT_VOICE}). Options: ${voiceList}`, + description: "Voice ID (engine-specific; see --list)", alias: "v", }, speed: { @@ -62,9 +76,13 @@ export default defineCommand({ }, lang: { type: "string", - description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`, + description: "Synthesis language (engine-specific; see --list)", alias: "l", }, + steps: { + type: "string", + description: "Supertonic only: flow-matching denoise steps (default: 8, higher = slower)", + }, list: { type: "boolean", description: "List available voices and exit", @@ -77,9 +95,13 @@ export default defineCommand({ }, }, async run({ args }) { + // ── Resolve engine ──────────────────────────────────────────────── + const engineId: EngineId = resolveEngine(args.engine, args.json); + const engine = await getEngine(engineId); + // ── List voices mode ────────────────────────────────────────────── if (args.list) { - return listVoices(args.json); + return listVoices(engine, args.json); } // ── Resolve input text ──────────────────────────────────────────── @@ -106,47 +128,45 @@ export default defineCommand({ process.exit(1); } - // ── Resolve output path ─────────────────────────────────────────── + // ── Resolve output path & params ────────────────────────────────── const output = resolve(args.output ?? "speech.wav"); - const voice = args.voice ?? DEFAULT_VOICE; - const speed = args.speed ? parseFloat(args.speed) : 1.0; + const voice = args.voice ?? engine.defaultVoice; + const speed = args.speed ? parseFloat(args.speed) : undefined; - if (isNaN(speed) || speed <= 0 || speed > 3) { + if (speed !== undefined && (isNaN(speed) || speed <= 0 || speed > 3)) { console.error(c.error("Speed must be a number between 0.1 and 3.0")); process.exit(1); } - const inferredLang = inferLangFromVoiceId(voice); - let lang: SupportedLang = inferredLang; - if (args.lang != null) { - const requested = String(args.lang).toLowerCase(); - if (!isSupportedLang(requested)) { - errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`); + let steps: number | undefined; + if (args.steps != null) { + steps = parseInt(args.steps, 10); + if (isNaN(steps) || steps < 1 || steps > 64) { + console.error(c.error("Steps must be an integer between 1 and 64")); process.exit(1); } - lang = requested; } - // Mismatched voice/lang is a valid stylization (English text, French - // phonemization for accent), so this is a hint, not an error. - if (!args.json && args.lang != null && lang !== inferredLang) { - console.log( - c.dim( - ` Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`, - ), - ); + // ── Resolve language (engine validates its own codes) ───────────── + let lang: string; + try { + lang = engine.resolveLang(voice, args.lang ?? undefined); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + errorBox("Invalid --lang", message); + process.exit(1); } // ── Synthesize ──────────────────────────────────────────────────── - const { synthesize } = await import("../tts/synthesize.js"); const spin = args.json ? null : clack.spinner(); - spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`); + spin?.start(`Generating speech with ${engine.label} · ${c.accent(voice)} (${lang})...`); try { - const result = await synthesize(text, output, { + const result = await engine.synthesize(text, output, { voice, speed, lang, + steps, onProgress: spin ? (msg) => spin.message(msg) : undefined, }); @@ -154,10 +174,12 @@ export default defineCommand({ console.log( JSON.stringify({ ok: true, + engine: engine.id, voice, - speed, + speed: speed ?? null, lang, langApplied: result.langApplied, + sampleRate: result.sampleRate, durationSeconds: result.durationSeconds, outputPath: result.outputPath, }), @@ -171,7 +193,7 @@ export default defineCommand({ if (args.lang != null && !result.langApplied) { console.log( c.dim( - " Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.", + " Note: installed engine version does not support the --lang option; default phonemization was used.", ), ); } @@ -189,33 +211,41 @@ export default defineCommand({ }); // --------------------------------------------------------------------------- -// List voices +// Helpers // --------------------------------------------------------------------------- -function listVoices(json: boolean): void { - const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) })); +function resolveEngine(value: string | undefined, json: boolean): EngineId { + if (value == null) return DEFAULT_ENGINE; + const normalized = String(value).toLowerCase(); + if (!isEngineId(normalized)) { + const message = `Got "${value}". Must be one of: ${engineList}.`; + if (json) console.log(JSON.stringify({ ok: false, error: `Invalid --engine. ${message}` })); + else errorBox("Invalid --engine", message); + process.exit(1); + } + return normalized; +} + +function listVoices(engine: TtsEngine, json: boolean): void { + const voices = engine.listVoices(); + const rows = voices.map((v) => ({ ...v, defaultLang: engine.resolveLang(v.id) })); if (json) { - console.log(JSON.stringify(rows)); + console.log(JSON.stringify({ engine: engine.id, voices: rows })); return; } - console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`); + console.log(`\n${c.bold("Available voices")} (${engine.label})\n`); console.log( - ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`, + ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`, ); - console.log(` ${c.dim("─".repeat(72))}`); + console.log(` ${c.dim("─".repeat(76))}`); for (const row of rows) { const id = row.id.padEnd(18); const label = row.label.padEnd(13); - const lang = row.language.padEnd(10); + const lang = row.language.padEnd(13); const code = row.defaultLang.padEnd(10); console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`); } - console.log( - `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`, - ); - console.log( - ` ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`, - ); + console.log(`\n ${c.dim(`Supported --lang codes: ${engine.supportedLangs.join(", ")}`)}\n`); } diff --git a/packages/cli/src/tts/engine.ts b/packages/cli/src/tts/engine.ts new file mode 100644 index 000000000..93427231a --- /dev/null +++ b/packages/cli/src/tts/engine.ts @@ -0,0 +1,93 @@ +// --------------------------------------------------------------------------- +// TTS engine abstraction +// +// HyperFrames ships more than one text-to-speech backend. Kokoro-82M runs +// through a Python subprocess (kokoro-onnx); Supertonic 3 runs fully +// in-process via onnxruntime-node. Both implement the `TtsEngine` interface +// below so `hyperframes tts` can switch between them with `--engine` without +// the command knowing anything engine-specific. +// --------------------------------------------------------------------------- + +export interface SynthesizeResult { + outputPath: string; + sampleRate: number; + durationSeconds: number; + /** + * Whether the requested phonemizer/language was actually applied. Kokoro + * may ignore `lang` on older installs; Supertonic always honors it. + */ + langApplied: boolean; +} + +export interface TtsVoice { + id: string; + label: string; + /** Human-readable language or locale label, e.g. "en-US" or "Multilingual". */ + language: string; + gender: "female" | "male"; +} + +export interface EngineSynthesizeOptions { + voice?: string; + /** Speech speed multiplier (engine-specific sane range; ~0.1–3.0). */ + speed?: number; + /** Phonemizer/synthesis language code in the engine's own vocabulary. */ + lang?: string; + /** + * Flow-matching denoise steps (Supertonic only, ~4–12). Ignored by engines + * that don't expose iterative sampling. + */ + steps?: number; + onProgress?: (message: string) => void; +} + +export interface TtsEngine { + /** Stable identifier used by the `--engine` flag. */ + readonly id: EngineId; + /** Display name shown in help and voice listings. */ + readonly label: string; + /** Voice ID used when the caller doesn't pass one. */ + readonly defaultVoice: string; + /** Voices this engine exposes for `tts --list`. */ + listVoices(): TtsVoice[]; + /** + * Resolve the language code for a given voice + optional explicit `--lang`. + * Returns the code to pass to `synthesize`. Throws if `requested` is not a + * valid code for this engine. + */ + resolveLang(voice: string, requested?: string): string; + /** Supported language codes, for help text and validation messages. */ + readonly supportedLangs: readonly string[]; + synthesize( + text: string, + outputPath: string, + options?: EngineSynthesizeOptions, + ): Promise; +} + +export type EngineId = "kokoro" | "supertonic"; + +export const ENGINE_IDS = ["kokoro", "supertonic"] as const; + +export const DEFAULT_ENGINE: EngineId = "kokoro"; + +export function isEngineId(value: string): value is EngineId { + return (ENGINE_IDS as readonly string[]).includes(value); +} + +/** + * Lazily construct a TTS engine by id. Engines are imported on demand so the + * CLI doesn't load onnxruntime-node (or probe for Python) until TTS is used. + */ +export async function getEngine(id: EngineId): Promise { + switch (id) { + case "kokoro": { + const { KokoroEngine } = await import("./engines/kokoro.js"); + return new KokoroEngine(); + } + case "supertonic": { + const { SupertonicEngine } = await import("./engines/supertonic/index.js"); + return new SupertonicEngine(); + } + } +} diff --git a/packages/cli/src/tts/engines/kokoro.ts b/packages/cli/src/tts/engines/kokoro.ts new file mode 100644 index 000000000..1d23ae6e0 --- /dev/null +++ b/packages/cli/src/tts/engines/kokoro.ts @@ -0,0 +1,61 @@ +// Kokoro-82M engine — a thin adapter over the existing kokoro-onnx Python +// pipeline (../synthesize.ts, ../manager.ts). Behavior is unchanged; this just +// exposes it through the shared TtsEngine interface. + +import type { + EngineId, + EngineSynthesizeOptions, + SynthesizeResult, + TtsEngine, + TtsVoice, +} from "../engine.js"; +import { synthesize } from "../synthesize.js"; +import { + BUNDLED_VOICES, + DEFAULT_VOICE, + SUPPORTED_LANGS, + inferLangFromVoiceId, + isSupportedLang, + type SupportedLang, +} from "../manager.js"; + +export class KokoroEngine implements TtsEngine { + readonly id: EngineId = "kokoro"; + readonly label = "Kokoro-82M"; + readonly defaultVoice = DEFAULT_VOICE; + readonly supportedLangs = SUPPORTED_LANGS; + + listVoices(): TtsVoice[] { + return BUNDLED_VOICES.map((v) => ({ + id: v.id, + label: v.label, + language: v.language, + gender: v.gender, + })); + } + + resolveLang(voice: string, requested?: string): string { + const inferred = inferLangFromVoiceId(voice); + if (requested == null) return inferred; + const normalized = requested.toLowerCase(); + if (!isSupportedLang(normalized)) { + throw new Error( + `Invalid --lang "${requested}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`, + ); + } + return normalized; + } + + synthesize( + text: string, + outputPath: string, + options?: EngineSynthesizeOptions, + ): Promise { + return synthesize(text, outputPath, { + voice: options?.voice, + speed: options?.speed, + lang: options?.lang as SupportedLang | undefined, + onProgress: options?.onProgress, + }); + } +} diff --git a/packages/cli/src/tts/engines/supertonic/index.ts b/packages/cli/src/tts/engines/supertonic/index.ts new file mode 100644 index 000000000..e8eaf635d --- /dev/null +++ b/packages/cli/src/tts/engines/supertonic/index.ts @@ -0,0 +1,118 @@ +// Supertonic 3 engine — on-device multilingual TTS via onnxruntime-node. +// Unlike the Kokoro engine, this runs the full pipeline in-process (no Python). + +import { dirname } from "node:path"; +import { mkdirSync, existsSync } from "node:fs"; +import type { + EngineId, + EngineSynthesizeOptions, + SynthesizeResult, + TtsEngine, + TtsVoice, +} from "../../engine.js"; +import { + DEFAULT_VOICE, + ensureModels, + ensureVoice, + isSupertonicVoice, + type SupertonicVoiceId, +} from "./manager.js"; +import { SUPPORTED_LANGS, isSupertonicLang } from "./runtime.js"; + +const DEFAULT_LANG = "en"; +const DEFAULT_STEPS = 8; + +// Preset voices shipped on Hugging Face. Supertonic styles are multilingual — +// the speaker identity is independent of the synthesis language (passed via +// --lang), so language is labelled "Multilingual". +const VOICES: TtsVoice[] = [ + { id: "F1", label: "Female 1", language: "Multilingual", gender: "female" }, + { id: "F2", label: "Female 2", language: "Multilingual", gender: "female" }, + { id: "F3", label: "Female 3", language: "Multilingual", gender: "female" }, + { id: "F4", label: "Female 4", language: "Multilingual", gender: "female" }, + { id: "F5", label: "Female 5", language: "Multilingual", gender: "female" }, + { id: "M1", label: "Male 1", language: "Multilingual", gender: "male" }, + { id: "M2", label: "Male 2", language: "Multilingual", gender: "male" }, + { id: "M3", label: "Male 3", language: "Multilingual", gender: "male" }, + { id: "M4", label: "Male 4", language: "Multilingual", gender: "male" }, + { id: "M5", label: "Male 5", language: "Multilingual", gender: "male" }, +]; + +export class SupertonicEngine implements TtsEngine { + readonly id: EngineId = "supertonic"; + readonly label = "Supertonic 3"; + readonly defaultVoice = DEFAULT_VOICE; + readonly supportedLangs = SUPPORTED_LANGS; + + listVoices(): TtsVoice[] { + return VOICES; + } + + resolveLang(_voice: string, requested?: string): string { + if (requested == null) return DEFAULT_LANG; + const normalized = requested.toLowerCase(); + if (!isSupertonicLang(normalized)) { + throw new Error( + `Invalid --lang "${requested}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`, + ); + } + return normalized; + } + + async synthesize( + text: string, + outputPath: string, + options?: EngineSynthesizeOptions, + ): Promise { + const voiceId = options?.voice ?? DEFAULT_VOICE; + if (!isSupertonicVoice(voiceId)) { + throw new Error( + `Unknown Supertonic voice "${voiceId}". Options: ${VOICES.map((v) => v.id).join(", ")}.`, + ); + } + const voice: SupertonicVoiceId = voiceId; + + const speed = options?.speed ?? 1.05; + const lang = options?.lang ?? DEFAULT_LANG; + const steps = options?.steps ?? DEFAULT_STEPS; + if (!isSupertonicLang(lang)) { + throw new Error(`Invalid language "${lang}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`); + } + + // 1. Ensure assets are downloaded (models once, voice once). + const [onnxDir, voicePath] = await Promise.all([ + ensureModels({ onProgress: options?.onProgress }), + ensureVoice(voice, { onProgress: options?.onProgress }), + ]); + + // 2. Load the ONNX pipeline and the selected voice style. + options?.onProgress?.("Loading Supertonic models..."); + const { loadTextToSpeech, loadVoiceStyle, writeWavFile } = await import("./runtime.js"); + const tts = await loadTextToSpeech(onnxDir); + const style = loadVoiceStyle([voicePath]); + + // 3. Synthesize. + options?.onProgress?.(`Generating speech with voice ${voice} (${lang})...`); + const { wav, duration } = await tts.call(text, lang, style, steps, speed); + + // Trim trailing padding to the predicted duration, matching the upstream + // example's per-item slice. + const durationSeconds = duration[0] ?? 0; + const sampleCount = Math.floor(tts.sampleRate * durationSeconds); + const samples = sampleCount > 0 ? wav.slice(0, sampleCount) : wav; + + mkdirSync(dirname(outputPath), { recursive: true }); + writeWavFile(outputPath, samples, tts.sampleRate); + + if (!existsSync(outputPath)) { + throw new Error("Synthesis completed but no output file was created"); + } + + return { + outputPath, + sampleRate: tts.sampleRate, + durationSeconds: Math.round(durationSeconds * 1000) / 1000, + langApplied: true, + }; + } +} diff --git a/packages/cli/src/tts/engines/supertonic/manager.ts b/packages/cli/src/tts/engines/supertonic/manager.ts new file mode 100644 index 000000000..f158466d0 --- /dev/null +++ b/packages/cli/src/tts/engines/supertonic/manager.ts @@ -0,0 +1,88 @@ +// Supertonic 3 asset manager — downloads the ONNX models, config, and preset +// voice styles from Hugging Face on first use and caches them under +// ~/.cache/hyperframes/tts/supertonic/. Mirrors the Kokoro manager's +// download-on-demand pattern (../../manager.ts). + +import { existsSync, mkdirSync } from "node:fs"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import { downloadFile } from "../../../utils/download.js"; + +const CACHE_DIR = join(homedir(), ".cache", "hyperframes", "tts", "supertonic"); +const ONNX_DIR = join(CACHE_DIR, "onnx"); +const VOICES_DIR = join(CACHE_DIR, "voice_styles"); + +// Repo layout: https://huggingface.co/Supertone/supertonic-3 cloned into the +// `assets/` dir the upstream examples expect, so `onnx/` and `voice_styles/` +// are top-level there. `resolve/main/` serves the raw (LFS) bytes. +const HF_BASE = "https://huggingface.co/Supertone/supertonic-3/resolve/main"; + +// Files the inference pipeline loads from the onnx dir (see runtime.ts). +const ONNX_FILES = [ + "duration_predictor.onnx", + "text_encoder.onnx", + "vector_estimator.onnx", + "vocoder.onnx", + "tts.json", + "unicode_indexer.json", +] as const; + +// Preset speaker embeddings. Small JSON files (~KB each). +const VOICE_FILES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4", "F5"] as const; + +export type SupertonicVoiceId = (typeof VOICE_FILES)[number]; + +export const DEFAULT_VOICE: SupertonicVoiceId = "F1"; + +/** + * Ensure all ONNX models + config are present. Returns the directory path to + * pass to `loadTextToSpeech`. Downloads any missing files (the .onnx models + * total a few hundred MB; downloaded once, then cached). + */ +export async function ensureModels(options?: { + onProgress?: (message: string) => void; +}): Promise { + mkdirSync(ONNX_DIR, { recursive: true }); + + const missing = ONNX_FILES.filter((f) => !existsSync(join(ONNX_DIR, f))); + if (missing.length === 0) return ONNX_DIR; + + options?.onProgress?.( + `Downloading Supertonic models (${missing.length} file${missing.length === 1 ? "" : "s"}, ~300 MB on first run)...`, + ); + + // Sequential to keep progress legible and avoid hammering the CDN. + for (const file of missing) { + const dest = join(ONNX_DIR, file); + options?.onProgress?.(`Downloading ${file}...`); + await downloadFile(`${HF_BASE}/onnx/${file}`, dest); + if (!existsSync(dest)) { + throw new Error(`Supertonic model download failed: ${file}`); + } + } + + return ONNX_DIR; +} + +/** + * Ensure a single preset voice-style JSON is present and return its path. + */ +export async function ensureVoice( + voice: SupertonicVoiceId, + options?: { onProgress?: (message: string) => void }, +): Promise { + mkdirSync(VOICES_DIR, { recursive: true }); + const dest = join(VOICES_DIR, `${voice}.json`); + if (existsSync(dest)) return dest; + + options?.onProgress?.(`Downloading voice ${voice}...`); + await downloadFile(`${HF_BASE}/voice_styles/${voice}.json`, dest); + if (!existsSync(dest)) { + throw new Error(`Supertonic voice download failed: ${voice}`); + } + return dest; +} + +export function isSupertonicVoice(value: string): value is SupertonicVoiceId { + return (VOICE_FILES as readonly string[]).includes(value); +} diff --git a/packages/cli/src/tts/engines/supertonic/runtime.ts b/packages/cli/src/tts/engines/supertonic/runtime.ts new file mode 100644 index 000000000..d4ad8bec4 --- /dev/null +++ b/packages/cli/src/tts/engines/supertonic/runtime.ts @@ -0,0 +1,550 @@ +// Supertonic 3 inference runtime — a faithful TypeScript port of the upstream +// Node reference implementation (supertonic/nodejs/helper.js). The pipeline +// runs entirely in-process via onnxruntime-node: no Python, no subprocess. +// +// Stages: Unicode tokenization → duration prediction → text encoding → +// Gaussian latent sampling → iterative flow-matching denoise → vocoder. +// The numeric logic mirrors upstream exactly; only types and ESM/TS idioms +// were added. See https://github.com/supertone-inc/supertonic. + +import { readFileSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import * as ort from "onnxruntime-node"; + +const AVAILABLE_LANGS = [ + "en", + "ko", + "ja", + "ar", + "bg", + "cs", + "da", + "de", + "el", + "es", + "et", + "fi", + "fr", + "hi", + "hr", + "hu", + "id", + "it", + "lt", + "lv", + "nl", + "pl", + "pt", + "ro", + "ru", + "sk", + "sl", + "sv", + "tr", + "uk", + "vi", + "na", +] as const; + +export type SupertonicLang = (typeof AVAILABLE_LANGS)[number]; + +export function isSupertonicLang(value: string): value is SupertonicLang { + return (AVAILABLE_LANGS as readonly string[]).includes(value); +} + +export const SUPPORTED_LANGS = AVAILABLE_LANGS; + +// --------------------------------------------------------------------------- +// Config & tensor helpers +// --------------------------------------------------------------------------- + +interface TtsConfig { + ae: { sample_rate: number; base_chunk_size: number }; + ttl: { chunk_compress_factor: number; latent_dim: number }; +} + +type Nested = number | Nested[]; + +/** Recursively flatten a (possibly ragged) nested number array — `arr.flat(Infinity)`. */ +function flatten(arr: Nested[]): number[] { + const out: number[] = []; + const walk = (x: Nested): void => { + if (Array.isArray(x)) { + for (const item of x) walk(item); + } else { + out.push(x); + } + }; + for (const item of arr) walk(item); + return out; +} + +function arrayToTensor(array: Nested[], dims: number[]): ort.Tensor { + return new ort.Tensor("float32", Float32Array.from(flatten(array)), dims); +} + +function intArrayToTensor(array: Nested[], dims: number[]): ort.Tensor { + const flat = flatten(array); + return new ort.Tensor("int64", BigInt64Array.from(flat.map((x) => BigInt(x))), dims); +} + +function tensorToNumbers(t: ort.Tensor): number[] { + return Array.from(t.data as ArrayLike); +} + +/** Convert per-item lengths to a [B, 1, maxLen] binary mask. */ +function lengthToMask(lengths: number[], maxLen?: number): number[][][] { + const max = maxLen ?? Math.max(...lengths); + const mask: number[][][] = []; + for (const len of lengths) { + const row: number[] = []; + for (let j = 0; j < max; j++) { + row.push(j < len ? 1.0 : 0.0); + } + mask.push([row]); // [B, 1, maxLen] + } + return mask; +} + +function getLatentMask( + wavLengths: number[], + baseChunkSize: number, + chunkCompressFactor: number, +): number[][][] { + const latentSize = baseChunkSize * chunkCompressFactor; + const latentLengths = wavLengths.map((len) => Math.floor((len + latentSize - 1) / latentSize)); + return lengthToMask(latentLengths); +} + +// --------------------------------------------------------------------------- +// Unicode text processing +// --------------------------------------------------------------------------- + +class UnicodeProcessor { + private readonly indexer: Record; + + constructor(unicodeIndexerJsonPath: string) { + this.indexer = JSON.parse(readFileSync(unicodeIndexerJsonPath, "utf8")); + } + + private preprocessText(text: string, lang: string): string { + text = text.normalize("NFKD"); + + // Remove emojis (wide Unicode range). + const emojiPattern = + /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu; + text = text.replace(emojiPattern, ""); + + const replacements: Record = { + "–": "-", + "‑": "-", + "—": "-", + _: " ", + "“": '"', + "”": '"', + "‘": "'", + "’": "'", + "´": "'", + "`": "'", + "[": " ", + "]": " ", + "|": " ", + "/": " ", + "#": " ", + "→": " ", + "←": " ", + }; + for (const [k, v] of Object.entries(replacements)) { + text = text.replaceAll(k, v); + } + + text = text.replace(/[♥☆♡©\\]/g, ""); + + const exprReplacements: Record = { + "@": " at ", + "e.g.,": "for example, ", + "i.e.,": "that is, ", + }; + for (const [k, v] of Object.entries(exprReplacements)) { + text = text.replaceAll(k, v); + } + + // Fix spacing around punctuation. + text = text.replace(/ ,/g, ","); + text = text.replace(/ \./g, "."); + text = text.replace(/ !/g, "!"); + text = text.replace(/ \?/g, "?"); + text = text.replace(/ ;/g, ";"); + text = text.replace(/ :/g, ":"); + text = text.replace(/ '/g, "'"); + + // Collapse duplicate quotes. + while (text.includes('""')) text = text.replace('""', '"'); + while (text.includes("''")) text = text.replace("''", "'"); + while (text.includes("``")) text = text.replace("``", "`"); + + text = text.replace(/\s+/g, " ").trim(); + + // Append a period if it doesn't already end with terminal punctuation. + if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(text)) { + text += "."; + } + + if (!AVAILABLE_LANGS.includes(lang as SupertonicLang)) { + throw new Error(`Invalid language: ${lang}. Available: ${AVAILABLE_LANGS.join(", ")}`); + } + + return `<${lang}>${text}`; + } + + private textToUnicodeValues(text: string): number[] { + return Array.from(text).map((char) => char.charCodeAt(0)); + } + + call(textList: string[], langList: string[]): { textIds: number[][]; textMask: number[][][] } { + const processedTexts = textList.map((t, i) => this.preprocessText(t, langList[i]!)); + const textIdsLengths = processedTexts.map((t) => t.length); + const maxLen = Math.max(...textIdsLengths); + + const textIds: number[][] = []; + for (const processed of processedTexts) { + const row = new Array(maxLen).fill(0); + const unicodeVals = this.textToUnicodeValues(processed); + for (let j = 0; j < unicodeVals.length; j++) { + row[j] = this.indexer[String(unicodeVals[j])] ?? 0; + } + textIds.push(row); + } + + const textMask = lengthToMask(textIdsLengths); + return { textIds, textMask }; + } +} + +// --------------------------------------------------------------------------- +// Voice style +// --------------------------------------------------------------------------- + +// Exported as the return type of loadVoiceStyle (required for declaration emit). +// fallow-ignore-next-line unused-exports +export class Style { + constructor( + readonly ttl: ort.Tensor, + readonly dp: ort.Tensor, + ) {} +} + +interface VoiceStyleJson { + style_ttl: { dims: number[]; data: Nested[] }; + style_dp: { dims: number[]; data: Nested[] }; +} + +/** + * Load one or more preset voice-style JSON files into a batched Style. All + * files must share the same tensor dimensions (they do, for v3 presets). + */ +export function loadVoiceStyle(voiceStylePaths: string[]): Style { + const bsz = voiceStylePaths.length; + + const first: VoiceStyleJson = JSON.parse(readFileSync(voiceStylePaths[0]!, "utf8")); + const ttlDims = first.style_ttl.dims; + const dpDims = first.style_dp.dims; + + const ttlDim1 = ttlDims[1]!; + const ttlDim2 = ttlDims[2]!; + const dpDim1 = dpDims[1]!; + const dpDim2 = dpDims[2]!; + + const ttlFlat = new Float32Array(bsz * ttlDim1 * ttlDim2); + const dpFlat = new Float32Array(bsz * dpDim1 * dpDim2); + + for (let i = 0; i < bsz; i++) { + const voiceStyle: VoiceStyleJson = JSON.parse(readFileSync(voiceStylePaths[i]!, "utf8")); + ttlFlat.set(flatten(voiceStyle.style_ttl.data), i * ttlDim1 * ttlDim2); + dpFlat.set(flatten(voiceStyle.style_dp.data), i * dpDim1 * dpDim2); + } + + const ttlStyle = new ort.Tensor("float32", ttlFlat, [bsz, ttlDim1, ttlDim2]); + const dpStyle = new ort.Tensor("float32", dpFlat, [bsz, dpDim1, dpDim2]); + return new Style(ttlStyle, dpStyle); +} + +// --------------------------------------------------------------------------- +// TextToSpeech pipeline +// --------------------------------------------------------------------------- + +// Exported as the return type of loadTextToSpeech (required for declaration emit). +// fallow-ignore-next-line unused-exports +export class TextToSpeech { + readonly sampleRate: number; + private readonly baseChunkSize: number; + private readonly chunkCompressFactor: number; + private readonly ldim: number; + + constructor( + cfgs: TtsConfig, + private readonly textProcessor: UnicodeProcessor, + private readonly dpOrt: ort.InferenceSession, + private readonly textEncOrt: ort.InferenceSession, + private readonly vectorEstOrt: ort.InferenceSession, + private readonly vocoderOrt: ort.InferenceSession, + ) { + this.sampleRate = cfgs.ae.sample_rate; + this.baseChunkSize = cfgs.ae.base_chunk_size; + this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor; + this.ldim = cfgs.ttl.latent_dim; + } + + private sampleNoisyLatent(duration: number[]): { + noisyLatent: number[][][]; + latentMask: number[][][]; + } { + const wavLenMax = Math.max(...duration) * this.sampleRate; + const wavLengths = duration.map((d) => Math.floor(d * this.sampleRate)); + const chunkSize = this.baseChunkSize * this.chunkCompressFactor; + const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize); + const latentDim = this.ldim * this.chunkCompressFactor; + + const noisyLatent: number[][][] = []; + for (let b = 0; b < duration.length; b++) { + const batch: number[][] = []; + for (let d = 0; d < latentDim; d++) { + const row: number[] = []; + for (let t = 0; t < latentLen; t++) { + // Box-Muller transform for a standard normal sample. + const eps = 1e-10; + const u1 = Math.max(eps, Math.random()); + const u2 = Math.random(); + row.push(Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2)); + } + batch.push(row); + } + noisyLatent.push(batch); + } + + const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor); + + for (let b = 0; b < noisyLatent.length; b++) { + for (let d = 0; d < noisyLatent[b]!.length; d++) { + for (let t = 0; t < noisyLatent[b]![d]!.length; t++) { + noisyLatent[b]![d]![t]! *= latentMask[b]![0]![t]!; + } + } + } + + return { noisyLatent, latentMask }; + } + + private async infer( + textList: string[], + langList: string[], + style: Style, + totalStep: number, + speed = 1.05, + ): Promise<{ wav: number[]; duration: number[] }> { + if (textList.length !== style.ttl.dims[0]) { + throw new Error("Number of texts must match number of style vectors"); + } + const bsz = textList.length; + const { textIds, textMask } = this.textProcessor.call(textList, langList); + const textIdsShape = [bsz, textIds[0]!.length]; + const textMaskShape = [bsz, 1, textMask[0]![0]!.length]; + + const textMaskTensor = arrayToTensor(textMask, textMaskShape); + + const dpResult = await this.dpOrt.run({ + text_ids: intArrayToTensor(textIds, textIdsShape), + style_dp: style.dp, + text_mask: textMaskTensor, + }); + + const durOnnx = tensorToNumbers(dpResult.duration!); + // Faster speech → shorter duration. + for (let i = 0; i < durOnnx.length; i++) { + durOnnx[i]! /= speed; + } + + const textEncResult = await this.textEncOrt.run({ + text_ids: intArrayToTensor(textIds, textIdsShape), + style_ttl: style.ttl, + text_mask: textMaskTensor, + }); + const textEmbTensor = textEncResult.text_emb!; + + const { noisyLatent, latentMask } = this.sampleNoisyLatent(durOnnx); + const latentShape = [bsz, noisyLatent[0]!.length, noisyLatent[0]![0]!.length]; + const latentMaskShape = [bsz, 1, latentMask[0]![0]!.length]; + + const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape); + + const totalStepTensor = arrayToTensor(new Array(bsz).fill(totalStep), [bsz]); + + for (let step = 0; step < totalStep; step++) { + const currentStepArray = new Array(bsz).fill(step); + + const vectorEstResult = await this.vectorEstOrt.run({ + noisy_latent: arrayToTensor(noisyLatent, latentShape), + text_emb: textEmbTensor, + style_ttl: style.ttl, + text_mask: textMaskTensor, + latent_mask: latentMaskTensor, + total_step: totalStepTensor, + current_step: arrayToTensor(currentStepArray, [bsz]), + }); + + const denoisedLatent = tensorToNumbers(vectorEstResult.denoised_latent!); + + let idx = 0; + for (let b = 0; b < noisyLatent.length; b++) { + for (let d = 0; d < noisyLatent[b]!.length; d++) { + for (let t = 0; t < noisyLatent[b]![d]!.length; t++) { + noisyLatent[b]![d]![t] = denoisedLatent[idx++]!; + } + } + } + } + + const vocoderResult = await this.vocoderOrt.run({ + latent: arrayToTensor(noisyLatent, latentShape), + }); + + return { wav: tensorToNumbers(vocoderResult.wav_tts!), duration: durOnnx }; + } + + /** + * Single-speaker synthesis with automatic chunking for long text. Chunks are + * joined with `silenceDuration` seconds of silence. + */ + async call( + text: string, + lang: string, + style: Style, + totalStep: number, + speed = 1.05, + silenceDuration = 0.3, + ): Promise<{ wav: number[]; duration: number[] }> { + if (style.ttl.dims[0] !== 1) { + throw new Error("Single speaker text to speech only supports a single style"); + } + const maxLen = lang === "ko" || lang === "ja" ? 120 : 300; + const textList = chunkText(text, maxLen); + + let wavCat: number[] | null = null; + let durCat = 0; + + for (const chunk of textList) { + const { wav, duration } = await this.infer([chunk], [lang], style, totalStep, speed); + if (wavCat === null) { + wavCat = wav; + durCat = duration[0]!; + } else { + const silenceLen = Math.floor(silenceDuration * this.sampleRate); + const silence = new Array(silenceLen).fill(0); + wavCat = [...wavCat, ...silence, ...wav]; + durCat += duration[0]! + silenceDuration; + } + } + + return { wav: wavCat ?? [], duration: [durCat] }; + } + + /** Batch synthesis (one style + lang per text), no automatic chunking. */ + async batch( + textList: string[], + langList: string[], + style: Style, + totalStep: number, + speed = 1.05, + ): Promise<{ wav: number[]; duration: number[] }> { + return this.infer(textList, langList, style, totalStep, speed); + } +} + +// --------------------------------------------------------------------------- +// Loaders +// --------------------------------------------------------------------------- + +/** + * Load the four ONNX models + config + tokenizer from `onnxDir`. CPU only; + * upstream has not yet shipped a GPU path. + */ +export async function loadTextToSpeech(onnxDir: string): Promise { + const cfgs: TtsConfig = JSON.parse(readFileSync(join(onnxDir, "tts.json"), "utf8")); + const opts = {}; + + const [dpOrt, textEncOrt, vectorEstOrt, vocoderOrt] = await Promise.all([ + ort.InferenceSession.create(join(onnxDir, "duration_predictor.onnx"), opts), + ort.InferenceSession.create(join(onnxDir, "text_encoder.onnx"), opts), + ort.InferenceSession.create(join(onnxDir, "vector_estimator.onnx"), opts), + ort.InferenceSession.create(join(onnxDir, "vocoder.onnx"), opts), + ]); + + const textProcessor = new UnicodeProcessor(join(onnxDir, "unicode_indexer.json")); + return new TextToSpeech(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt); +} + +/** Write a mono 16-bit PCM WAV file. Samples are clamped to [-1, 1]. */ +export function writeWavFile(filename: string, audioData: number[], sampleRate: number): void { + const numChannels = 1; + const bitsPerSample = 16; + const byteRate = (sampleRate * numChannels * bitsPerSample) / 8; + const blockAlign = (numChannels * bitsPerSample) / 8; + const dataSize = (audioData.length * bitsPerSample) / 8; + + const buffer = Buffer.alloc(44 + dataSize); + + buffer.write("RIFF", 0); + buffer.writeUInt32LE(36 + dataSize, 4); + buffer.write("WAVE", 8); + + buffer.write("fmt ", 12); + buffer.writeUInt32LE(16, 16); + buffer.writeUInt16LE(1, 20); // PCM + buffer.writeUInt16LE(numChannels, 22); + buffer.writeUInt32LE(sampleRate, 24); + buffer.writeUInt32LE(byteRate, 28); + buffer.writeUInt16LE(blockAlign, 32); + buffer.writeUInt16LE(bitsPerSample, 34); + + buffer.write("data", 36); + buffer.writeUInt32LE(dataSize, 40); + + for (let i = 0; i < audioData.length; i++) { + const sample = Math.max(-1, Math.min(1, audioData[i]!)); + buffer.writeInt16LE(Math.floor(sample * 32767), 44 + i * 2); + } + + writeFileSync(filename, buffer); +} + +/** Split text into <= maxLen segments on paragraph then sentence boundaries. */ +function chunkText(text: string, maxLen = 300): string[] { + const paragraphs = text + .trim() + .split(/\n\s*\n+/) + .filter((p) => p.trim()); + + const chunks: string[] = []; + + for (let paragraph of paragraphs) { + paragraph = paragraph.trim(); + if (!paragraph) continue; + + const sentences = paragraph.split( + /(? { const tmp = `${dest}.tmp`; return new Promise((resolve, reject) => { - const follow = (u: string) => { + const follow = (u: string, redirects = 0) => { httpsGet(u, (res) => { - if (res.statusCode === 301 || res.statusCode === 302) { + if ([301, 302, 303, 307, 308].includes(res.statusCode ?? 0)) { const location = res.headers.location; if (location) { - follow(location); + if (redirects >= 10) { + reject(new Error("Download failed: too many redirects")); + return; + } + follow(new URL(location, u).toString(), redirects + 1); return; } } diff --git a/skills/hyperframes-media/SKILL.md b/skills/hyperframes-media/SKILL.md index 13e3d4ae0..c6a296adb 100644 --- a/skills/hyperframes-media/SKILL.md +++ b/skills/hyperframes-media/SKILL.md @@ -1,6 +1,6 @@ --- name: hyperframes-media -description: Asset preprocessing for HyperFrames compositions — text-to-speech narration (Kokoro), audio/video transcription (Whisper), and background removal for transparent overlays (u2net). Use when generating voiceover from text, transcribing speech for captions, removing the background from a video or image to use as a transparent overlay, choosing a TTS voice or whisper model, or chaining these (TTS → transcribe → captions). Each command downloads its own model on first run. +description: Asset preprocessing for HyperFrames compositions — text-to-speech narration (Kokoro for English/Chinese, Supertonic 3 for 31 languages), audio/video transcription (Whisper), and background removal for transparent overlays (u2net). Use when generating voiceover from text, transcribing speech for captions, removing the background from a video or image to use as a transparent overlay, choosing a TTS engine/voice or whisper model, or chaining these (TTS → transcribe → captions). Each command downloads its own model on first run. --- # HyperFrames Media Preprocessing @@ -9,14 +9,29 @@ Three CLI commands that produce assets for compositions: `tts` (speech), `transc ## Text-to-Speech (`tts`) -Generate speech audio locally with Kokoro-82M. No API key. +Generate speech audio locally. No API key. Two engines, selected with `--engine`: + +- **`kokoro`** (default) — Kokoro-82M. 54 named voices, content-matched. Best for **English**; the only engine that supports **Chinese**. Non-English needs Python + `espeak-ng` (see Requirements). +- **`supertonic`** — Supertonic 3. Runs fully in-process (no Python, no phonemizer). Covers **31 languages**. Preferred for **any non-English language except Chinese**. ```bash npx hyperframes tts "Text here" --voice af_nova --output narration.wav npx hyperframes tts script.txt --voice bf_emma --output narration.wav -npx hyperframes tts --list # all 54 voices +npx hyperframes tts --list # Kokoro's 54 voices +npx hyperframes tts --list --engine supertonic # Supertonic's voices ``` +### Choosing an engine (language routing) + +| Language | Engine | Why | +| ---------------------------------------------------------------- | ----------------------- | -------------------------------- | +| English | `kokoro` (default) | Rich, content-matched voices | +| Chinese / Mandarin | `kokoro` (`zf_xiaobei`) | **Only** Kokoro supports `zh` | +| Any other language (Korean, German, Russian, Arabic, Dutch, …) | `supertonic` | Kokoro can't; no extra deps | +| Overlap (Spanish, French, Hindi, Italian, Japanese, Portuguese) | either | Supertonic if avoiding espeak-ng | + +Rule of thumb: **English or Chinese → Kokoro. Everything else → Supertonic.** + ### Voice Selection Match voice to content. Default is `af_heart`. @@ -31,14 +46,23 @@ Match voice to content. Default is `af_heart`. ### Multilingual -Voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from the prefix — no `--lang` needed when the voice matches the text. +For non-English (except Chinese), use **Supertonic**. Pass the language with `--lang` and pick a voice (`F1`–`F5`, `M1`–`M5` — multilingual, gender only). No phonemizer or system packages needed. + +```bash +npx hyperframes tts "안녕하세요, 만나서 반갑습니다" --engine supertonic --lang ko --voice F1 --output ko.wav +npx hyperframes tts "Guten Tag, schön Sie zu sehen" --engine supertonic --lang de --voice M1 --output de.wav +``` + +Supertonic `--lang` codes (31): `ar` `bg` `cs` `da` `de` `el` `en` `es` `et` `fi` `fr` `hi` `hr` `hu` `id` `it` `ja` `ko` `lt` `lv` `nl` `pl` `pt` `ro` `ru` `sk` `sl` `sv` `tr` `uk` `vi`. **No Chinese** — use Kokoro for that. + +**Kokoro** stays best for English and is the only option for Chinese. Its voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from the prefix — no `--lang` needed when the voice matches the text. ```bash +npx hyperframes tts "你好,今天天气很好" --voice zf_xiaobei --output zh.wav # Chinese → Kokoro npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav -npx hyperframes tts "今日はいい天気ですね" --voice jf_alpha --output ja.wav ``` -Use `--lang` only to override auto-detection (stylized accents). Valid codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`. Non-English phonemization requires `espeak-ng` system-wide (`brew install espeak-ng` / `apt-get install espeak-ng`). +For Kokoro, use `--lang` only to override auto-detection (stylized accents). Valid codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`. Kokoro non-English phonemization requires `espeak-ng` system-wide (`brew install espeak-ng` / `apt-get install espeak-ng`) — another reason to prefer Supertonic for those languages. ### Speed @@ -53,7 +77,8 @@ For more than a few paragraphs, write to a `.txt` file and pass the path. Inputs ### Requirements -Python 3.8+ with `kokoro-onnx` and `soundfile` (`pip install kokoro-onnx soundfile`). Model downloads on first use (~311 MB + ~27 MB voices, cached in `~/.cache/hyperframes/tts/`). +- **Kokoro** (`--engine kokoro`, default): Python 3.8+ with `kokoro-onnx` and `soundfile` (`pip install kokoro-onnx soundfile`); non-English also needs `espeak-ng` system-wide. Model downloads on first use (~311 MB + ~27 MB voices, cached in `~/.cache/hyperframes/tts/`). +- **Supertonic** (`--engine supertonic`): no Python, no system packages — runs in-process via onnxruntime-node. Models (~300 MB) download from Hugging Face on first use, cached in `~/.cache/hyperframes/tts/supertonic/`. Tune quality/speed with `--steps` (default 8; fewer is faster). ## Transcription (`transcribe`)