Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 82 additions & 52 deletions packages/cli/src/commands/tts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,15 @@ export const examples: Example[] = [
["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
[
"Generate Spanish speech",
"Use the Supertonic engine",
'hyperframes tts "Lightning-fast on-device speech" --engine supertonic --voice F1',
],
[
"Supertonic in another language",
'hyperframes tts "안녕하세요" --engine supertonic --voice F2 --lang ko',
],
[
"Generate Spanish speech (Kokoro)",
'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav',
],
[
Expand All @@ -17,42 +25,48 @@ export const examples: Example[] = [
],
["Read text from a file", "hyperframes tts script.txt"],
["List available voices", "hyperframes tts --list"],
["List Supertonic voices", "hyperframes tts --list --engine supertonic"],
];
import { resolve, extname } from "node:path";
import * as clack from "@clack/prompts";
import { c } from "../ui/colors.js";
import { errorBox } from "../ui/format.js";
import {
DEFAULT_VOICE,
BUNDLED_VOICES,
SUPPORTED_LANGS,
inferLangFromVoiceId,
isSupportedLang,
type SupportedLang,
} from "../tts/manager.js";
DEFAULT_ENGINE,
ENGINE_IDS,
getEngine,
isEngineId,
type EngineId,
type TtsEngine,
} from "../tts/engine.js";

const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", ");
const langList = SUPPORTED_LANGS.join(", ");
const engineList = ENGINE_IDS.join(", ");

export default defineCommand({
meta: {
name: "tts",
description: "Generate speech audio from text using a local AI model (Kokoro-82M)",
description:
"Generate speech audio from text using a local AI model (Kokoro-82M or Supertonic 3)",
},
args: {
input: {
type: "positional",
description: "Text to speak, or path to a .txt file",
required: false,
},
engine: {
type: "string",
description: `TTS engine (default: ${DEFAULT_ENGINE}). Options: ${engineList}`,
alias: "e",
},
output: {
type: "string",
description: "Output file path (default: speech.wav in current directory)",
alias: "o",
},
voice: {
type: "string",
description: `Voice ID (default: ${DEFAULT_VOICE}). Options: ${voiceList}`,
description: "Voice ID (engine-specific; see --list)",
alias: "v",
},
speed: {
Expand All @@ -62,9 +76,13 @@ export default defineCommand({
},
lang: {
type: "string",
description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`,
description: "Synthesis language (engine-specific; see --list)",
alias: "l",
},
steps: {
type: "string",
description: "Supertonic only: flow-matching denoise steps (default: 8, higher = slower)",
},
list: {
type: "boolean",
description: "List available voices and exit",
Expand All @@ -77,9 +95,13 @@ export default defineCommand({
},
},
async run({ args }) {
// ── Resolve engine ────────────────────────────────────────────────
const engineId: EngineId = resolveEngine(args.engine, args.json);
const engine = await getEngine(engineId);

// ── List voices mode ──────────────────────────────────────────────
if (args.list) {
return listVoices(args.json);
return listVoices(engine, args.json);
}

// ── Resolve input text ────────────────────────────────────────────
Expand All @@ -106,58 +128,58 @@ export default defineCommand({
process.exit(1);
}

// ── Resolve output path ───────────────────────────────────────────
// ── Resolve output path & params ──────────────────────────────────
const output = resolve(args.output ?? "speech.wav");
const voice = args.voice ?? DEFAULT_VOICE;
const speed = args.speed ? parseFloat(args.speed) : 1.0;
const voice = args.voice ?? engine.defaultVoice;
const speed = args.speed ? parseFloat(args.speed) : undefined;

if (isNaN(speed) || speed <= 0 || speed > 3) {
if (speed !== undefined && (isNaN(speed) || speed <= 0 || speed > 3)) {
console.error(c.error("Speed must be a number between 0.1 and 3.0"));
process.exit(1);
}

const inferredLang = inferLangFromVoiceId(voice);
let lang: SupportedLang = inferredLang;
if (args.lang != null) {
const requested = String(args.lang).toLowerCase();
if (!isSupportedLang(requested)) {
errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`);
let steps: number | undefined;
if (args.steps != null) {
steps = parseInt(args.steps, 10);
if (isNaN(steps) || steps < 1 || steps > 64) {
console.error(c.error("Steps must be an integer between 1 and 64"));
process.exit(1);
}
lang = requested;
}

// Mismatched voice/lang is a valid stylization (English text, French
// phonemization for accent), so this is a hint, not an error.
if (!args.json && args.lang != null && lang !== inferredLang) {
console.log(
c.dim(
` Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`,
),
);
// ── Resolve language (engine validates its own codes) ─────────────
let lang: string;
try {
lang = engine.resolveLang(voice, args.lang ?? undefined);
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
errorBox("Invalid --lang", message);
process.exit(1);
}

// ── Synthesize ────────────────────────────────────────────────────
const { synthesize } = await import("../tts/synthesize.js");
const spin = args.json ? null : clack.spinner();
spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`);
spin?.start(`Generating speech with ${engine.label} · ${c.accent(voice)} (${lang})...`);

try {
const result = await synthesize(text, output, {
const result = await engine.synthesize(text, output, {
voice,
speed,
lang,
steps,
onProgress: spin ? (msg) => spin.message(msg) : undefined,
});

if (args.json) {
console.log(
JSON.stringify({
ok: true,
engine: engine.id,
voice,
speed,
speed: speed ?? null,
lang,
langApplied: result.langApplied,
sampleRate: result.sampleRate,
durationSeconds: result.durationSeconds,
outputPath: result.outputPath,
}),
Expand All @@ -171,7 +193,7 @@ export default defineCommand({
if (args.lang != null && !result.langApplied) {
console.log(
c.dim(
" Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.",
" Note: installed engine version does not support the --lang option; default phonemization was used.",
),
);
}
Expand All @@ -189,33 +211,41 @@ export default defineCommand({
});

// ---------------------------------------------------------------------------
// List voices
// Helpers
// ---------------------------------------------------------------------------

function listVoices(json: boolean): void {
const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) }));
function resolveEngine(value: string | undefined, json: boolean): EngineId {
if (value == null) return DEFAULT_ENGINE;
const normalized = String(value).toLowerCase();
if (!isEngineId(normalized)) {
const message = `Got "${value}". Must be one of: ${engineList}.`;
if (json) console.log(JSON.stringify({ ok: false, error: `Invalid --engine. ${message}` }));
else errorBox("Invalid --engine", message);
process.exit(1);
}
return normalized;
}

function listVoices(engine: TtsEngine, json: boolean): void {
const voices = engine.listVoices();
const rows = voices.map((v) => ({ ...v, defaultLang: engine.resolveLang(v.id) }));

if (json) {
console.log(JSON.stringify(rows));
console.log(JSON.stringify({ engine: engine.id, voices: rows }));
return;
}

console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
console.log(`\n${c.bold("Available voices")} (${engine.label})\n`);
console.log(
` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`,
` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`,
);
console.log(` ${c.dim("─".repeat(72))}`);
console.log(` ${c.dim("─".repeat(76))}`);
for (const row of rows) {
const id = row.id.padEnd(18);
const label = row.label.padEnd(13);
const lang = row.language.padEnd(10);
const lang = row.language.padEnd(13);
const code = row.defaultLang.padEnd(10);
console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`);
}
console.log(
`\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
);
console.log(
` ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`,
);
console.log(`\n ${c.dim(`Supported --lang codes: ${engine.supportedLangs.join(", ")}`)}\n`);
}
93 changes: 93 additions & 0 deletions packages/cli/src/tts/engine.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// ---------------------------------------------------------------------------
// TTS engine abstraction
//
// HyperFrames ships more than one text-to-speech backend. Kokoro-82M runs
// through a Python subprocess (kokoro-onnx); Supertonic 3 runs fully
// in-process via onnxruntime-node. Both implement the `TtsEngine` interface
// below so `hyperframes tts` can switch between them with `--engine` without
// the command knowing anything engine-specific.
// ---------------------------------------------------------------------------

export interface SynthesizeResult {
outputPath: string;
sampleRate: number;
durationSeconds: number;
/**
* Whether the requested phonemizer/language was actually applied. Kokoro
* may ignore `lang` on older installs; Supertonic always honors it.
*/
langApplied: boolean;
}

export interface TtsVoice {
id: string;
label: string;
/** Human-readable language or locale label, e.g. "en-US" or "Multilingual". */
language: string;
gender: "female" | "male";
}

export interface EngineSynthesizeOptions {
voice?: string;
/** Speech speed multiplier (engine-specific sane range; ~0.1–3.0). */
speed?: number;
/** Phonemizer/synthesis language code in the engine's own vocabulary. */
lang?: string;
/**
* Flow-matching denoise steps (Supertonic only, ~4–12). Ignored by engines
* that don't expose iterative sampling.
*/
steps?: number;
onProgress?: (message: string) => void;
}

export interface TtsEngine {
/** Stable identifier used by the `--engine` flag. */
readonly id: EngineId;
/** Display name shown in help and voice listings. */
readonly label: string;
/** Voice ID used when the caller doesn't pass one. */
readonly defaultVoice: string;
/** Voices this engine exposes for `tts --list`. */
listVoices(): TtsVoice[];
/**
* Resolve the language code for a given voice + optional explicit `--lang`.
* Returns the code to pass to `synthesize`. Throws if `requested` is not a
* valid code for this engine.
*/
resolveLang(voice: string, requested?: string): string;
/** Supported language codes, for help text and validation messages. */
readonly supportedLangs: readonly string[];
synthesize(
text: string,
outputPath: string,
options?: EngineSynthesizeOptions,
): Promise<SynthesizeResult>;
}

export type EngineId = "kokoro" | "supertonic";

export const ENGINE_IDS = ["kokoro", "supertonic"] as const;

export const DEFAULT_ENGINE: EngineId = "kokoro";

export function isEngineId(value: string): value is EngineId {
return (ENGINE_IDS as readonly string[]).includes(value);
}

/**
* Lazily construct a TTS engine by id. Engines are imported on demand so the
* CLI doesn't load onnxruntime-node (or probe for Python) until TTS is used.
*/
export async function getEngine(id: EngineId): Promise<TtsEngine> {
switch (id) {
case "kokoro": {
const { KokoroEngine } = await import("./engines/kokoro.js");
return new KokoroEngine();
}
case "supertonic": {
const { SupertonicEngine } = await import("./engines/supertonic/index.js");
return new SupertonicEngine();
}
}
}
Loading