heygen-com · tantara · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts
@@ -8,7 +8,15 @@ export const examples: Example[] = [
   ["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
   ["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
   [
-    "Generate Spanish speech",
+    "Use the Supertonic engine",
+    'hyperframes tts "Lightning-fast on-device speech" --engine supertonic --voice F1',
+  ],
+  [
+    "Supertonic in another language",
+    'hyperframes tts "안녕하세요" --engine supertonic --voice F2 --lang ko',
+  ],
+  [
+    "Generate Spanish speech (Kokoro)",
     'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav',
   ],
   [
@@ -17,42 +25,48 @@ export const examples: Example[] = [
   ],
   ["Read text from a file", "hyperframes tts script.txt"],
   ["List available voices", "hyperframes tts --list"],
+  ["List Supertonic voices", "hyperframes tts --list --engine supertonic"],
 ];
 import { resolve, extname } from "node:path";
 import * as clack from "@clack/prompts";
 import { c } from "../ui/colors.js";
 import { errorBox } from "../ui/format.js";
 import {
-  DEFAULT_VOICE,
-  BUNDLED_VOICES,
-  SUPPORTED_LANGS,
-  inferLangFromVoiceId,
-  isSupportedLang,
-  type SupportedLang,
-} from "../tts/manager.js";
+  DEFAULT_ENGINE,
+  ENGINE_IDS,
+  getEngine,
+  isEngineId,
+  type EngineId,
+  type TtsEngine,
+} from "../tts/engine.js";
 
-const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", ");
-const langList = SUPPORTED_LANGS.join(", ");
+const engineList = ENGINE_IDS.join(", ");
 
 export default defineCommand({
   meta: {
     name: "tts",
-    description: "Generate speech audio from text using a local AI model (Kokoro-82M)",
+    description:
+      "Generate speech audio from text using a local AI model (Kokoro-82M or Supertonic 3)",
   },
   args: {
     input: {
       type: "positional",
       description: "Text to speak, or path to a .txt file",
       required: false,
     },
+    engine: {
+      type: "string",
+      description: `TTS engine (default: ${DEFAULT_ENGINE}). Options: ${engineList}`,
+      alias: "e",
+    },
     output: {
       type: "string",
       description: "Output file path (default: speech.wav in current directory)",
       alias: "o",
     },
     voice: {
       type: "string",
-      description: `Voice ID (default: ${DEFAULT_VOICE}). Options: ${voiceList}`,
+      description: "Voice ID (engine-specific; see --list)",
       alias: "v",
     },
     speed: {
@@ -62,9 +76,13 @@ export default defineCommand({
     },
     lang: {
       type: "string",
-      description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`,
+      description: "Synthesis language (engine-specific; see --list)",
       alias: "l",
     },
+    steps: {
+      type: "string",
+      description: "Supertonic only: flow-matching denoise steps (default: 8, higher = slower)",
+    },
     list: {
       type: "boolean",
       description: "List available voices and exit",
@@ -77,9 +95,13 @@ export default defineCommand({
     },
   },
   async run({ args }) {
+    // ── Resolve engine ────────────────────────────────────────────────
+    const engineId: EngineId = resolveEngine(args.engine, args.json);
+    const engine = await getEngine(engineId);
+
     // ── List voices mode ──────────────────────────────────────────────
     if (args.list) {
-      return listVoices(args.json);
+      return listVoices(engine, args.json);
     }
 
     // ── Resolve input text ────────────────────────────────────────────
@@ -106,58 +128,58 @@ export default defineCommand({
       process.exit(1);
     }
 
-    // ── Resolve output path ───────────────────────────────────────────
+    // ── Resolve output path & params ──────────────────────────────────
     const output = resolve(args.output ?? "speech.wav");
-    const voice = args.voice ?? DEFAULT_VOICE;
-    const speed = args.speed ? parseFloat(args.speed) : 1.0;
+    const voice = args.voice ?? engine.defaultVoice;
+    const speed = args.speed ? parseFloat(args.speed) : undefined;
 
-    if (isNaN(speed) || speed <= 0 || speed > 3) {
+    if (speed !== undefined && (isNaN(speed) || speed <= 0 || speed > 3)) {
       console.error(c.error("Speed must be a number between 0.1 and 3.0"));
       process.exit(1);
     }
 
-    const inferredLang = inferLangFromVoiceId(voice);
-    let lang: SupportedLang = inferredLang;
-    if (args.lang != null) {
-      const requested = String(args.lang).toLowerCase();
-      if (!isSupportedLang(requested)) {
-        errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`);
+    let steps: number | undefined;
+    if (args.steps != null) {
+      steps = parseInt(args.steps, 10);
+      if (isNaN(steps) || steps < 1 || steps > 64) {
+        console.error(c.error("Steps must be an integer between 1 and 64"));
         process.exit(1);
       }
-      lang = requested;
     }
 
-    // Mismatched voice/lang is a valid stylization (English text, French
-    // phonemization for accent), so this is a hint, not an error.
-    if (!args.json && args.lang != null && lang !== inferredLang) {
-      console.log(
-        c.dim(
-          `  Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`,
-        ),
-      );
+    // ── Resolve language (engine validates its own codes) ─────────────
+    let lang: string;
+    try {
+      lang = engine.resolveLang(voice, args.lang ?? undefined);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      errorBox("Invalid --lang", message);
+      process.exit(1);
     }
 
     // ── Synthesize ────────────────────────────────────────────────────
-    const { synthesize } = await import("../tts/synthesize.js");
     const spin = args.json ? null : clack.spinner();
-    spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`);
+    spin?.start(`Generating speech with ${engine.label} · ${c.accent(voice)} (${lang})...`);
 
     try {
-      const result = await synthesize(text, output, {
+      const result = await engine.synthesize(text, output, {
         voice,
         speed,
         lang,
+        steps,
         onProgress: spin ? (msg) => spin.message(msg) : undefined,
       });
 
       if (args.json) {
         console.log(
           JSON.stringify({
             ok: true,
+            engine: engine.id,
             voice,
-            speed,
+            speed: speed ?? null,
             lang,
             langApplied: result.langApplied,
+            sampleRate: result.sampleRate,
             durationSeconds: result.durationSeconds,
             outputPath: result.outputPath,
           }),
@@ -171,7 +193,7 @@ export default defineCommand({
         if (args.lang != null && !result.langApplied) {
           console.log(
             c.dim(
-              "  Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.",
+              "  Note: installed engine version does not support the --lang option; default phonemization was used.",
             ),
           );
         }
@@ -189,33 +211,41 @@ export default defineCommand({
 });
 
 // ---------------------------------------------------------------------------
-// List voices
+// Helpers
 // ---------------------------------------------------------------------------
 
-function listVoices(json: boolean): void {
-  const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) }));
+function resolveEngine(value: string | undefined, json: boolean): EngineId {
+  if (value == null) return DEFAULT_ENGINE;
+  const normalized = String(value).toLowerCase();
+  if (!isEngineId(normalized)) {
+    const message = `Got "${value}". Must be one of: ${engineList}.`;
+    if (json) console.log(JSON.stringify({ ok: false, error: `Invalid --engine. ${message}` }));
+    else errorBox("Invalid --engine", message);
+    process.exit(1);
+  }
+  return normalized;
+}
+
+function listVoices(engine: TtsEngine, json: boolean): void {
+  const voices = engine.listVoices();
+  const rows = voices.map((v) => ({ ...v, defaultLang: engine.resolveLang(v.id) }));
 
   if (json) {
-    console.log(JSON.stringify(rows));
+    console.log(JSON.stringify({ engine: engine.id, voices: rows }));
     return;
   }
 
-  console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
+  console.log(`\n${c.bold("Available voices")} (${engine.label})\n`);
   console.log(
-    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}   ${c.dim("Lang code")}  ${c.dim("Gender")}`,
+    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}      ${c.dim("Lang code")}  ${c.dim("Gender")}`,
   );
-  console.log(`  ${c.dim("─".repeat(72))}`);
+  console.log(`  ${c.dim("─".repeat(76))}`);
   for (const row of rows) {
     const id = row.id.padEnd(18);
     const label = row.label.padEnd(13);
-    const lang = row.language.padEnd(10);
+    const lang = row.language.padEnd(13);
     const code = row.defaultLang.padEnd(10);
     console.log(`  ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`);
   }
-  console.log(
-    `\n  ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
-  );
-  console.log(
-    `  ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`,
-  );
+  console.log(`\n  ${c.dim(`Supported --lang codes: ${engine.supportedLangs.join(", ")}`)}\n`);
 }
diff --git a/packages/cli/src/tts/engine.ts b/packages/cli/src/tts/engine.ts
@@ -0,0 +1,93 @@
+// ---------------------------------------------------------------------------
+// TTS engine abstraction
+//
+// HyperFrames ships more than one text-to-speech backend. Kokoro-82M runs
+// through a Python subprocess (kokoro-onnx); Supertonic 3 runs fully
+// in-process via onnxruntime-node. Both implement the `TtsEngine` interface
+// below so `hyperframes tts` can switch between them with `--engine` without
+// the command knowing anything engine-specific.
+// ---------------------------------------------------------------------------
+
+export interface SynthesizeResult {
+  outputPath: string;
+  sampleRate: number;
+  durationSeconds: number;
+  /**
+   * Whether the requested phonemizer/language was actually applied. Kokoro
+   * may ignore `lang` on older installs; Supertonic always honors it.
+   */
+  langApplied: boolean;
+}
+
+export interface TtsVoice {
+  id: string;
+  label: string;
+  /** Human-readable language or locale label, e.g. "en-US" or "Multilingual". */
+  language: string;
+  gender: "female" | "male";
+}
+
+export interface EngineSynthesizeOptions {
+  voice?: string;
+  /** Speech speed multiplier (engine-specific sane range; ~0.1–3.0). */
+  speed?: number;
+  /** Phonemizer/synthesis language code in the engine's own vocabulary. */
+  lang?: string;
+  /**
+   * Flow-matching denoise steps (Supertonic only, ~4–12). Ignored by engines
+   * that don't expose iterative sampling.
+   */
+  steps?: number;
+  onProgress?: (message: string) => void;
+}
+
+export interface TtsEngine {
+  /** Stable identifier used by the `--engine` flag. */
+  readonly id: EngineId;
+  /** Display name shown in help and voice listings. */
+  readonly label: string;
+  /** Voice ID used when the caller doesn't pass one. */
+  readonly defaultVoice: string;
+  /** Voices this engine exposes for `tts --list`. */
+  listVoices(): TtsVoice[];
+  /**
+   * Resolve the language code for a given voice + optional explicit `--lang`.
+   * Returns the code to pass to `synthesize`. Throws if `requested` is not a
+   * valid code for this engine.
+   */
+  resolveLang(voice: string, requested?: string): string;
+  /** Supported language codes, for help text and validation messages. */
+  readonly supportedLangs: readonly string[];
+  synthesize(
+    text: string,
+    outputPath: string,
+    options?: EngineSynthesizeOptions,
+  ): Promise<SynthesizeResult>;
+}
+
+export type EngineId = "kokoro" | "supertonic";
+
+export const ENGINE_IDS = ["kokoro", "supertonic"] as const;
+
+export const DEFAULT_ENGINE: EngineId = "kokoro";
+
+export function isEngineId(value: string): value is EngineId {
+  return (ENGINE_IDS as readonly string[]).includes(value);
+}
+
+/**
+ * Lazily construct a TTS engine by id. Engines are imported on demand so the
+ * CLI doesn't load onnxruntime-node (or probe for Python) until TTS is used.
+ */
+export async function getEngine(id: EngineId): Promise<TtsEngine> {
+  switch (id) {
+    case "kokoro": {
+      const { KokoroEngine } = await import("./engines/kokoro.js");
+      return new KokoroEngine();
+    }
+    case "supertonic": {
+      const { SupertonicEngine } = await import("./engines/supertonic/index.js");
+      return new SupertonicEngine();
+    }
+  }
+}