diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts
index bf0bc03c1..db505f552 100644
--- a/packages/cli/src/commands/tts.ts
+++ b/packages/cli/src/commands/tts.ts
@@ -8,7 +8,15 @@ export const examples: Example[] = [
   ["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
   ["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
   [
-    "Generate Spanish speech",
+    "Use the Supertonic engine",
+    'hyperframes tts "Lightning-fast on-device speech" --engine supertonic --voice F1',
+  ],
+  [
+    "Supertonic in another language",
+    'hyperframes tts "안녕하세요" --engine supertonic --voice F2 --lang ko',
+  ],
+  [
+    "Generate Spanish speech (Kokoro)",
     'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav',
   ],
   [
@@ -17,27 +25,28 @@ export const examples: Example[] = [
   ],
   ["Read text from a file", "hyperframes tts script.txt"],
   ["List available voices", "hyperframes tts --list"],
+  ["List Supertonic voices", "hyperframes tts --list --engine supertonic"],
 ];
 import { resolve, extname } from "node:path";
 import * as clack from "@clack/prompts";
 import { c } from "../ui/colors.js";
 import { errorBox } from "../ui/format.js";
 import {
-  DEFAULT_VOICE,
-  BUNDLED_VOICES,
-  SUPPORTED_LANGS,
-  inferLangFromVoiceId,
-  isSupportedLang,
-  type SupportedLang,
-} from "../tts/manager.js";
+  DEFAULT_ENGINE,
+  ENGINE_IDS,
+  getEngine,
+  isEngineId,
+  type EngineId,
+  type TtsEngine,
+} from "../tts/engine.js";
 
-const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", ");
-const langList = SUPPORTED_LANGS.join(", ");
+const engineList = ENGINE_IDS.join(", ");
 
 export default defineCommand({
   meta: {
     name: "tts",
-    description: "Generate speech audio from text using a local AI model (Kokoro-82M)",
+    description:
+      "Generate speech audio from text using a local AI model (Kokoro-82M or Supertonic 3)",
   },
   args: {
     input: {
@@ -45,6 +54,11 @@ export default defineCommand({
       description: "Text to speak, or path to a .txt file",
       required: false,
     },
+    engine: {
+      type: "string",
+      description: `TTS engine (default: ${DEFAULT_ENGINE}). Options: ${engineList}`,
+      alias: "e",
+    },
     output: {
       type: "string",
       description: "Output file path (default: speech.wav in current directory)",
@@ -52,7 +66,7 @@ export default defineCommand({
     },
     voice: {
       type: "string",
-      description: `Voice ID (default: ${DEFAULT_VOICE}). Options: ${voiceList}`,
+      description: "Voice ID (engine-specific; see --list)",
       alias: "v",
     },
     speed: {
@@ -62,9 +76,13 @@ export default defineCommand({
     },
     lang: {
       type: "string",
-      description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`,
+      description: "Synthesis language (engine-specific; see --list)",
       alias: "l",
     },
+    steps: {
+      type: "string",
+      description: "Supertonic only: flow-matching denoise steps (default: 8, higher = slower)",
+    },
     list: {
       type: "boolean",
       description: "List available voices and exit",
@@ -77,9 +95,13 @@ export default defineCommand({
     },
   },
   async run({ args }) {
+    // ── Resolve engine ────────────────────────────────────────────────
+    const engineId: EngineId = resolveEngine(args.engine, args.json);
+    const engine = await getEngine(engineId);
+
     // ── List voices mode ──────────────────────────────────────────────
     if (args.list) {
-      return listVoices(args.json);
+      return listVoices(engine, args.json);
     }
 
     // ── Resolve input text ────────────────────────────────────────────
@@ -106,47 +128,45 @@ export default defineCommand({
       process.exit(1);
     }
 
-    // ── Resolve output path ───────────────────────────────────────────
+    // ── Resolve output path & params ──────────────────────────────────
     const output = resolve(args.output ?? "speech.wav");
-    const voice = args.voice ?? DEFAULT_VOICE;
-    const speed = args.speed ? parseFloat(args.speed) : 1.0;
+    const voice = args.voice ?? engine.defaultVoice;
+    const speed = args.speed ? parseFloat(args.speed) : undefined;
 
-    if (isNaN(speed) || speed <= 0 || speed > 3) {
+    if (speed !== undefined && (isNaN(speed) || speed <= 0 || speed > 3)) {
       console.error(c.error("Speed must be a number between 0.1 and 3.0"));
       process.exit(1);
     }
 
-    const inferredLang = inferLangFromVoiceId(voice);
-    let lang: SupportedLang = inferredLang;
-    if (args.lang != null) {
-      const requested = String(args.lang).toLowerCase();
-      if (!isSupportedLang(requested)) {
-        errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`);
+    let steps: number | undefined;
+    if (args.steps != null) {
+      steps = parseInt(args.steps, 10);
+      if (isNaN(steps) || steps < 1 || steps > 64) {
+        console.error(c.error("Steps must be an integer between 1 and 64"));
         process.exit(1);
       }
-      lang = requested;
     }
 
-    // Mismatched voice/lang is a valid stylization (English text, French
-    // phonemization for accent), so this is a hint, not an error.
-    if (!args.json && args.lang != null && lang !== inferredLang) {
-      console.log(
-        c.dim(
-          `  Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`,
-        ),
-      );
+    // ── Resolve language (engine validates its own codes) ─────────────
+    let lang: string;
+    try {
+      lang = engine.resolveLang(voice, args.lang ?? undefined);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      errorBox("Invalid --lang", message);
+      process.exit(1);
     }
 
     // ── Synthesize ────────────────────────────────────────────────────
-    const { synthesize } = await import("../tts/synthesize.js");
     const spin = args.json ? null : clack.spinner();
-    spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`);
+    spin?.start(`Generating speech with ${engine.label} · ${c.accent(voice)} (${lang})...`);
 
     try {
-      const result = await synthesize(text, output, {
+      const result = await engine.synthesize(text, output, {
         voice,
         speed,
         lang,
+        steps,
         onProgress: spin ? (msg) => spin.message(msg) : undefined,
       });
 
@@ -154,10 +174,12 @@ export default defineCommand({
         console.log(
           JSON.stringify({
             ok: true,
+            engine: engine.id,
             voice,
-            speed,
+            speed: speed ?? null,
             lang,
             langApplied: result.langApplied,
+            sampleRate: result.sampleRate,
             durationSeconds: result.durationSeconds,
             outputPath: result.outputPath,
           }),
@@ -171,7 +193,7 @@ export default defineCommand({
         if (args.lang != null && !result.langApplied) {
           console.log(
             c.dim(
-              "  Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.",
+              "  Note: installed engine version does not support the --lang option; default phonemization was used.",
             ),
           );
         }
@@ -189,33 +211,41 @@ export default defineCommand({
 });
 
 // ---------------------------------------------------------------------------
-// List voices
+// Helpers
 // ---------------------------------------------------------------------------
 
-function listVoices(json: boolean): void {
-  const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) }));
+/** Validate the `--engine` flag (case-insensitive); reports and exits with code 1 if unknown. */
+function resolveEngine(value: string | undefined, json: boolean): EngineId {
+  if (value == null) return DEFAULT_ENGINE;
+  const candidate = String(value).toLowerCase();
+  if (isEngineId(candidate)) return candidate;
+
+  const detail = `Got "${value}". Must be one of: ${engineList}.`;
+  if (json) console.log(JSON.stringify({ ok: false, error: `Invalid --engine. ${detail}` }));
+  else errorBox("Invalid --engine", detail);
+  process.exit(1);
+}
+
+function listVoices(engine: TtsEngine, json: boolean): void {
+  const voices = engine.listVoices();
+  const rows = voices.map((v) => ({ ...v, defaultLang: engine.resolveLang(v.id) }));
 
   if (json) {
-    console.log(JSON.stringify(rows));
+    console.log(JSON.stringify({ engine: engine.id, voices: rows }));
     return;
   }
 
-  console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
+  console.log(`\n${c.bold("Available voices")} (${engine.label})\n`);
   console.log(
-    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}   ${c.dim("Lang code")}  ${c.dim("Gender")}`,
+    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}      ${c.dim("Lang code")}  ${c.dim("Gender")}`,
   );
-  console.log(`  ${c.dim("─".repeat(72))}`);
+  console.log(`  ${c.dim("─".repeat(76))}`);
   for (const row of rows) {
     const id = row.id.padEnd(18);
     const label = row.label.padEnd(13);
-    const lang = row.language.padEnd(10);
+    const lang = row.language.padEnd(13);
     const code = row.defaultLang.padEnd(10);
     console.log(`  ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`);
   }
-  console.log(
-    `\n  ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
-  );
-  console.log(
-    `  ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`,
-  );
+  console.log(`\n  ${c.dim(`Supported --lang codes: ${engine.supportedLangs.join(", ")}`)}\n`);
 }
diff --git a/packages/cli/src/tts/engine.ts b/packages/cli/src/tts/engine.ts
new file mode 100644
index 000000000..93427231a
--- /dev/null
+++ b/packages/cli/src/tts/engine.ts
@@ -0,0 +1,93 @@
+// ---------------------------------------------------------------------------
+// TTS engine abstraction
+//
+// HyperFrames ships more than one text-to-speech backend. Kokoro-82M runs
+// through a Python subprocess (kokoro-onnx); Supertonic 3 runs fully
+// in-process via onnxruntime-node. Both implement the `TtsEngine` interface
+// below so `hyperframes tts` can switch between them with `--engine` without
+// the command knowing anything engine-specific.
+// ---------------------------------------------------------------------------
+
+export interface SynthesizeResult {
+  outputPath: string;
+  sampleRate: number;
+  durationSeconds: number;
+  /**
+   * Whether the requested phonemizer/language was actually applied. Kokoro
+   * may ignore `lang` on older installs; Supertonic always honors it.
+   */
+  langApplied: boolean;
+}
+
+export interface TtsVoice {
+  id: string;
+  label: string;
+  /** Human-readable language or locale label, e.g. "en-US" or "Multilingual". */
+  language: string;
+  gender: "female" | "male";
+}
+
+export interface EngineSynthesizeOptions {
+  voice?: string;
+  /** Speech speed multiplier (engine-specific sane range; ~0.1–3.0). */
+  speed?: number;
+  /** Phonemizer/synthesis language code in the engine's own vocabulary. */
+  lang?: string;
+  /**
+   * Flow-matching denoise steps (Supertonic only, ~4–12). Ignored by engines
+   * that don't expose iterative sampling.
+   */
+  steps?: number;
+  onProgress?: (message: string) => void;
+}
+
+export interface TtsEngine {
+  /** Stable identifier used by the `--engine` flag. */
+  readonly id: EngineId;
+  /** Display name shown in help and voice listings. */
+  readonly label: string;
+  /** Voice ID used when the caller doesn't pass one. */
+  readonly defaultVoice: string;
+  /** Voices this engine exposes for `tts --list`. */
+  listVoices(): TtsVoice[];
+  /**
+   * Resolve the language code for a given voice + optional explicit `--lang`.
+   * Returns the code to pass to `synthesize`. Throws if `requested` is not a
+   * valid code for this engine.
+   */
+  resolveLang(voice: string, requested?: string): string;
+  /** Supported language codes, for help text and validation messages. */
+  readonly supportedLangs: readonly string[];
+  synthesize(
+    text: string,
+    outputPath: string,
+    options?: EngineSynthesizeOptions,
+  ): Promise<SynthesizeResult>;
+}
+
+export const ENGINE_IDS = ["kokoro", "supertonic"] as const;
+/** Engine id accepted by `--engine`; derived from ENGINE_IDS so the list and type cannot drift. */
+export type EngineId = (typeof ENGINE_IDS)[number];
+
+export const DEFAULT_ENGINE: EngineId = "kokoro";
+
+export function isEngineId(value: string): value is EngineId {
+  return (ENGINE_IDS as readonly string[]).includes(value);
+}
+
+/**
+ * Construct a TTS engine by id. Each engine module is loaded with a dynamic
+ * `import()` inside its own `case`, so only the requested engine is imported.
+ */
+export async function getEngine(id: EngineId): Promise<TtsEngine> {
+  switch (id) {
+    case "kokoro": {
+      const { KokoroEngine } = await import("./engines/kokoro.js");
+      return new KokoroEngine();
+    }
+    case "supertonic": {
+      const { SupertonicEngine } = await import("./engines/supertonic/index.js");
+      return new SupertonicEngine();
+    }
+  }
+}
diff --git a/packages/cli/src/tts/engines/kokoro.ts b/packages/cli/src/tts/engines/kokoro.ts
new file mode 100644
index 000000000..1d23ae6e0
--- /dev/null
+++ b/packages/cli/src/tts/engines/kokoro.ts
@@ -0,0 +1,61 @@
+// Kokoro-82M engine — a thin adapter over the existing kokoro-onnx Python
+// pipeline (../synthesize.ts, ../manager.ts). Behavior is unchanged; this just
+// exposes it through the shared TtsEngine interface.
+
+import type {
+  EngineId,
+  EngineSynthesizeOptions,
+  SynthesizeResult,
+  TtsEngine,
+  TtsVoice,
+} from "../engine.js";
+import { synthesize } from "../synthesize.js";
+import {
+  BUNDLED_VOICES,
+  DEFAULT_VOICE,
+  SUPPORTED_LANGS,
+  inferLangFromVoiceId,
+  isSupportedLang,
+  type SupportedLang,
+} from "../manager.js";
+
+export class KokoroEngine implements TtsEngine {
+  readonly id: EngineId = "kokoro";
+  readonly label = "Kokoro-82M";
+  readonly defaultVoice = DEFAULT_VOICE;
+  readonly supportedLangs = SUPPORTED_LANGS;
+
+  listVoices(): TtsVoice[] {
+    return BUNDLED_VOICES.map((v) => ({
+      id: v.id,
+      label: v.label,
+      language: v.language,
+      gender: v.gender,
+    }));
+  }
+
+  resolveLang(voice: string, requested?: string): string {
+    const inferred = inferLangFromVoiceId(voice);
+    if (requested == null) return inferred;
+    const normalized = requested.toLowerCase();
+    if (!isSupportedLang(normalized)) {
+      throw new Error(
+        `Invalid --lang "${requested}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`,
+      );
+    }
+    return normalized;
+  }
+
+  synthesize(
+    text: string,
+    outputPath: string,
+    options?: EngineSynthesizeOptions,
+  ): Promise<SynthesizeResult> {
+    return synthesize(text, outputPath, {
+      voice: options?.voice,
+      speed: options?.speed,
+      lang: options?.lang as SupportedLang | undefined,
+      onProgress: options?.onProgress,
+    });
+  }
+}
diff --git a/packages/cli/src/tts/engines/supertonic/index.ts b/packages/cli/src/tts/engines/supertonic/index.ts
new file mode 100644
index 000000000..e8eaf635d
--- /dev/null
+++ b/packages/cli/src/tts/engines/supertonic/index.ts
@@ -0,0 +1,118 @@
+// Supertonic 3 engine — on-device multilingual TTS via onnxruntime-node.
+// Unlike the Kokoro engine, this runs the full pipeline in-process (no Python).
+
+import { dirname } from "node:path";
+import { mkdirSync, existsSync } from "node:fs";
+import type {
+  EngineId,
+  EngineSynthesizeOptions,
+  SynthesizeResult,
+  TtsEngine,
+  TtsVoice,
+} from "../../engine.js";
+import {
+  DEFAULT_VOICE,
+  ensureModels,
+  ensureVoice,
+  isSupertonicVoice,
+  type SupertonicVoiceId,
+} from "./manager.js";
+import { SUPPORTED_LANGS, isSupertonicLang } from "./runtime.js";
+
+const DEFAULT_LANG = "en";
+const DEFAULT_STEPS = 8;
+
+// Preset voices shipped on Hugging Face. Supertonic styles are multilingual —
+// the speaker identity is independent of the synthesis language (passed via
+// --lang), so language is labelled "Multilingual".
+const VOICES: TtsVoice[] = [
+  { id: "F1", label: "Female 1", language: "Multilingual", gender: "female" },
+  { id: "F2", label: "Female 2", language: "Multilingual", gender: "female" },
+  { id: "F3", label: "Female 3", language: "Multilingual", gender: "female" },
+  { id: "F4", label: "Female 4", language: "Multilingual", gender: "female" },
+  { id: "F5", label: "Female 5", language: "Multilingual", gender: "female" },
+  { id: "M1", label: "Male 1", language: "Multilingual", gender: "male" },
+  { id: "M2", label: "Male 2", language: "Multilingual", gender: "male" },
+  { id: "M3", label: "Male 3", language: "Multilingual", gender: "male" },
+  { id: "M4", label: "Male 4", language: "Multilingual", gender: "male" },
+  { id: "M5", label: "Male 5", language: "Multilingual", gender: "male" },
+];
+
+export class SupertonicEngine implements TtsEngine {
+  readonly id: EngineId = "supertonic";
+  readonly label = "Supertonic 3";
+  readonly defaultVoice = DEFAULT_VOICE;
+  readonly supportedLangs = SUPPORTED_LANGS;
+
+  listVoices(): TtsVoice[] {
+    return VOICES;
+  }
+
+  resolveLang(_voice: string, requested?: string): string {
+    if (requested == null) return DEFAULT_LANG;
+    const normalized = requested.toLowerCase();
+    if (!isSupertonicLang(normalized)) {
+      throw new Error(
+        `Invalid --lang "${requested}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`,
+      );
+    }
+    return normalized;
+  }
+
+  async synthesize(
+    text: string,
+    outputPath: string,
+    options?: EngineSynthesizeOptions,
+  ): Promise<SynthesizeResult> {
+    const voiceId = options?.voice ?? DEFAULT_VOICE;
+    if (!isSupertonicVoice(voiceId)) {
+      throw new Error(
+        `Unknown Supertonic voice "${voiceId}". Options: ${VOICES.map((v) => v.id).join(", ")}.`,
+      );
+    }
+    const voice: SupertonicVoiceId = voiceId;
+
+    const speed = options?.speed ?? 1.05;
+    const lang = options?.lang ?? DEFAULT_LANG;
+    const steps = options?.steps ?? DEFAULT_STEPS;
+    if (!isSupertonicLang(lang)) {
+      throw new Error(`Invalid language "${lang}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`);
+    }
+
+    // 1. Ensure assets are downloaded (models once, voice once).
+    const [onnxDir, voicePath] = await Promise.all([
+      ensureModels({ onProgress: options?.onProgress }),
+      ensureVoice(voice, { onProgress: options?.onProgress }),
+    ]);
+
+    // 2. Load the ONNX pipeline and the selected voice style.
+    options?.onProgress?.("Loading Supertonic models...");
+    const { loadTextToSpeech, loadVoiceStyle, writeWavFile } = await import("./runtime.js");
+    const tts = await loadTextToSpeech(onnxDir);
+    const style = loadVoiceStyle([voicePath]);
+
+    // 3. Synthesize.
+    options?.onProgress?.(`Generating speech with voice ${voice} (${lang})...`);
+    const { wav, duration } = await tts.call(text, lang, style, steps, speed);
+
+    // Trim trailing padding to the predicted duration, matching the upstream
+    // example's per-item slice.
+    const durationSeconds = duration[0] ?? 0;
+    const sampleCount = Math.floor(tts.sampleRate * durationSeconds);
+    const samples = sampleCount > 0 ? wav.slice(0, sampleCount) : wav;
+
+    mkdirSync(dirname(outputPath), { recursive: true });
+    writeWavFile(outputPath, samples, tts.sampleRate);
+
+    if (!existsSync(outputPath)) {
+      throw new Error("Synthesis completed but no output file was created");
+    }
+
+    return {
+      outputPath,
+      sampleRate: tts.sampleRate,
+      durationSeconds: Math.round(durationSeconds * 1000) / 1000,
+      langApplied: true,
+    };
+  }
+}
diff --git a/packages/cli/src/tts/engines/supertonic/manager.ts b/packages/cli/src/tts/engines/supertonic/manager.ts
new file mode 100644
index 000000000..f158466d0
--- /dev/null
+++ b/packages/cli/src/tts/engines/supertonic/manager.ts
@@ -0,0 +1,88 @@
+// Supertonic 3 asset manager — downloads the ONNX models, config, and preset
+// voice styles from Hugging Face on first use and caches them under
+// ~/.cache/hyperframes/tts/supertonic/. Mirrors the Kokoro manager's
+// download-on-demand pattern (../../manager.ts).
+
+import { existsSync, mkdirSync } from "node:fs";
+import { homedir } from "node:os";
+import { join } from "node:path";
+import { downloadFile } from "../../../utils/download.js";
+
+const CACHE_DIR = join(homedir(), ".cache", "hyperframes", "tts", "supertonic");
+const ONNX_DIR = join(CACHE_DIR, "onnx");
+const VOICES_DIR = join(CACHE_DIR, "voice_styles");
+
+// Repo layout: https://huggingface.co/Supertone/supertonic-3 cloned into the
+// `assets/` dir the upstream examples expect, so `onnx/` and `voice_styles/`
+// are top-level there. `resolve/main/<path>` serves the raw (LFS) bytes.
+const HF_BASE = "https://huggingface.co/Supertone/supertonic-3/resolve/main";
+
+// Files the inference pipeline loads from the onnx dir (see runtime.ts).
+const ONNX_FILES = [
+  "duration_predictor.onnx",
+  "text_encoder.onnx",
+  "vector_estimator.onnx",
+  "vocoder.onnx",
+  "tts.json",
+  "unicode_indexer.json",
+] as const;
+
+// Preset speaker embeddings. Small JSON files (~KB each).
+const VOICE_FILES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4", "F5"] as const;
+
+export type SupertonicVoiceId = (typeof VOICE_FILES)[number];
+
+export const DEFAULT_VOICE: SupertonicVoiceId = "F1";
+
+/**
+ * Ensure all ONNX models + config are present. Returns the directory path to
+ * pass to `loadTextToSpeech`. Downloads any missing files (the .onnx models
+ * total a few hundred MB; downloaded once, then cached).
+ */
+export async function ensureModels(options?: {
+  onProgress?: (message: string) => void;
+}): Promise<string> {
+  mkdirSync(ONNX_DIR, { recursive: true });
+
+  const missing = ONNX_FILES.filter((f) => !existsSync(join(ONNX_DIR, f)));
+  if (missing.length === 0) return ONNX_DIR;
+
+  options?.onProgress?.(
+    `Downloading Supertonic models (${missing.length} file${missing.length === 1 ? "" : "s"}, ~300 MB on first run)...`,
+  );
+
+  // Sequential to keep progress legible and avoid hammering the CDN.
+  for (const file of missing) {
+    const dest = join(ONNX_DIR, file);
+    options?.onProgress?.(`Downloading ${file}...`);
+    await downloadFile(`${HF_BASE}/onnx/${file}`, dest);
+    if (!existsSync(dest)) {
+      throw new Error(`Supertonic model download failed: ${file}`);
+    }
+  }
+
+  return ONNX_DIR;
+}
+
+/**
+ * Ensure a single preset voice-style JSON is present and return its path.
+ */
+export async function ensureVoice(
+  voice: SupertonicVoiceId,
+  options?: { onProgress?: (message: string) => void },
+): Promise<string> {
+  mkdirSync(VOICES_DIR, { recursive: true });
+  const dest = join(VOICES_DIR, `${voice}.json`);
+  if (existsSync(dest)) return dest;
+
+  options?.onProgress?.(`Downloading voice ${voice}...`);
+  await downloadFile(`${HF_BASE}/voice_styles/${voice}.json`, dest);
+  if (!existsSync(dest)) {
+    throw new Error(`Supertonic voice download failed: ${voice}`);
+  }
+  return dest;
+}
+
+export function isSupertonicVoice(value: string): value is SupertonicVoiceId {
+  return (VOICE_FILES as readonly string[]).includes(value);
+}
diff --git a/packages/cli/src/tts/engines/supertonic/runtime.ts b/packages/cli/src/tts/engines/supertonic/runtime.ts
new file mode 100644
index 000000000..d4ad8bec4
--- /dev/null
+++ b/packages/cli/src/tts/engines/supertonic/runtime.ts
@@ -0,0 +1,550 @@
+// Supertonic 3 inference runtime — a faithful TypeScript port of the upstream
+// Node reference implementation (supertonic/nodejs/helper.js). The pipeline
+// runs entirely in-process via onnxruntime-node: no Python, no subprocess.
+//
+// Stages: Unicode tokenization → duration prediction → text encoding →
+// Gaussian latent sampling → iterative flow-matching denoise → vocoder.
+// The numeric logic mirrors upstream exactly; only types and ESM/TS idioms
+// were added. See https://github.com/supertone-inc/supertonic.
+
+import { readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import * as ort from "onnxruntime-node";
+
+const AVAILABLE_LANGS = [
+  "en",
+  "ko",
+  "ja",
+  "ar",
+  "bg",
+  "cs",
+  "da",
+  "de",
+  "el",
+  "es",
+  "et",
+  "fi",
+  "fr",
+  "hi",
+  "hr",
+  "hu",
+  "id",
+  "it",
+  "lt",
+  "lv",
+  "nl",
+  "pl",
+  "pt",
+  "ro",
+  "ru",
+  "sk",
+  "sl",
+  "sv",
+  "tr",
+  "uk",
+  "vi",
+  "na",
+] as const;
+
+export type SupertonicLang = (typeof AVAILABLE_LANGS)[number];
+
+export function isSupertonicLang(value: string): value is SupertonicLang {
+  return (AVAILABLE_LANGS as readonly string[]).includes(value);
+}
+
+export const SUPPORTED_LANGS = AVAILABLE_LANGS;
+
+// ---------------------------------------------------------------------------
+// Config & tensor helpers
+// ---------------------------------------------------------------------------
+
+interface TtsConfig {
+  ae: { sample_rate: number; base_chunk_size: number };
+  ttl: { chunk_compress_factor: number; latent_dim: number };
+}
+
+type Nested = number | Nested[];
+
+/** Recursively flatten a nested number array depth-first, preserving element order. */
+function flatten(arr: Nested[]): number[] {
+  const flat: number[] = [];
+  for (const item of arr) {
+    if (Array.isArray(item)) {
+      // Recurse into the sub-array and append its leaves in order.
+      for (const leaf of flatten(item)) flat.push(leaf);
+    } else {
+      flat.push(item);
+    }
+  }
+  return flat;
+}
+
+function arrayToTensor(array: Nested[], dims: number[]): ort.Tensor {
+  return new ort.Tensor("float32", Float32Array.from(flatten(array)), dims);
+}
+
+function intArrayToTensor(array: Nested[], dims: number[]): ort.Tensor {
+  const flat = flatten(array);
+  return new ort.Tensor("int64", BigInt64Array.from(flat.map((x) => BigInt(x))), dims);
+}
+
+function tensorToNumbers(t: ort.Tensor): number[] {
+  return Array.from(t.data as ArrayLike<number>);
+}
+
+/**
+ * Build a binary mask of shape [B, 1, maxLen] from per-item lengths.
+ * Position j of item b is 1 when j < lengths[b], otherwise 0. When `maxLen`
+ * is omitted, the largest entry of `lengths` is used as the row width.
+ */
+function lengthToMask(lengths: number[], maxLen?: number): number[][][] {
+  const width = maxLen ?? Math.max(...lengths);
+  return lengths.map((len) => {
+    const row: number[] = [];
+    for (let j = 0; j < width; j++) row.push(j < len ? 1.0 : 0.0);
+    return [row];
+  });
+}
+
+/** Mask over latent positions: floor((len + size - 1) / size) per wav length, size = base × factor. */
+function getLatentMask(
+  wavLengths: number[],
+  baseChunkSize: number,
+  chunkCompressFactor: number,
+): number[][][] {
+  const size = baseChunkSize * chunkCompressFactor;
+  return lengthToMask(wavLengths.map((len) => Math.floor((len + size - 1) / size)));
+}
+
+// ---------------------------------------------------------------------------
+// Unicode text processing
+// ---------------------------------------------------------------------------
+
+class UnicodeProcessor {
+  private readonly indexer: Record<string, number>;
+
+  constructor(unicodeIndexerJsonPath: string) {
+    this.indexer = JSON.parse(readFileSync(unicodeIndexerJsonPath, "utf8"));
+  }
+
+  private preprocessText(text: string, lang: string): string {
+    text = text.normalize("NFKD");
+
+    // Remove emojis (wide Unicode range).
+    const emojiPattern =
+      /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
+    text = text.replace(emojiPattern, "");
+
+    const replacements: Record<string, string> = {
+      "–": "-",
+      "‑": "-",
+      "—": "-",
+      _: " ",
+      "“": '"',
+      "”": '"',
+      "‘": "'",
+      "’": "'",
+      "´": "'",
+      "`": "'",
+      "[": " ",
+      "]": " ",
+      "|": " ",
+      "/": " ",
+      "#": " ",
+      "→": " ",
+      "←": " ",
+    };
+    for (const [k, v] of Object.entries(replacements)) {
+      text = text.replaceAll(k, v);
+    }
+
+    text = text.replace(/[♥☆♡©\\]/g, "");
+
+    const exprReplacements: Record<string, string> = {
+      "@": " at ",
+      "e.g.,": "for example, ",
+      "i.e.,": "that is, ",
+    };
+    for (const [k, v] of Object.entries(exprReplacements)) {
+      text = text.replaceAll(k, v);
+    }
+
+    // Fix spacing around punctuation.
+    text = text.replace(/ ,/g, ",");
+    text = text.replace(/ \./g, ".");
+    text = text.replace(/ !/g, "!");
+    text = text.replace(/ \?/g, "?");
+    text = text.replace(/ ;/g, ";");
+    text = text.replace(/ :/g, ":");
+    text = text.replace(/ '/g, "'");
+
+    // Collapse duplicate quotes.
+    while (text.includes('""')) text = text.replace('""', '"');
+    while (text.includes("''")) text = text.replace("''", "'");
+    while (text.includes("``")) text = text.replace("``", "`");
+
+    text = text.replace(/\s+/g, " ").trim();
+
+    // Append a period if it doesn't already end with terminal punctuation.
+    if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(text)) {
+      text += ".";
+    }
+
+    if (!AVAILABLE_LANGS.includes(lang as SupertonicLang)) {
+      throw new Error(`Invalid language: ${lang}. Available: ${AVAILABLE_LANGS.join(", ")}`);
+    }
+
+    return `<${lang}>${text}</${lang}>`;
+  }
+
+  /** Map text to Unicode code points, one per character (astral characters included). */
+  private textToUnicodeValues(text: string): number[] {
+    return Array.from(text, (char) => char.codePointAt(0) ?? 0);
+  }
+
+  /** Tokenize a batch into zero-padded ids [B, maxLen] and a matching mask [B, 1, maxLen]. */
+  call(textList: string[], langList: string[]): { textIds: number[][]; textMask: number[][][] } {
+    const processedTexts = textList.map((t, i) => this.preprocessText(t, langList[i]!));
+    const unicodeVals = processedTexts.map((t) => this.textToUnicodeValues(t));
+    // Count code points, not UTF-16 units, so the mask covers exactly the ids written below.
+    const textIdsLengths = unicodeVals.map((vals) => vals.length);
+    const maxLen = Math.max(...textIdsLengths);
+
+    const textIds = unicodeVals.map((vals) => {
+      const row = new Array<number>(maxLen).fill(0);
+      vals.forEach((val, j) => {
+        row[j] = this.indexer[String(val)] ?? 0;
+      });
+      return row;
+    });
+    return { textIds, textMask: lengthToMask(textIdsLengths) };
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Voice style
+// ---------------------------------------------------------------------------
+
+// Exported as the return type of loadVoiceStyle (required for declaration emit).
+// fallow-ignore-next-line unused-exports
+export class Style {
+  constructor(
+    readonly ttl: ort.Tensor,
+    readonly dp: ort.Tensor,
+  ) {}
+}
+
+interface VoiceStyleJson {
+  style_ttl: { dims: number[]; data: Nested[] };
+  style_dp: { dims: number[]; data: Nested[] };
+}
+
+/**
+ * Load one or more preset voice-style JSON files into a batched Style. All
+ * files must share the same tensor dimensions (they do, for v3 presets).
+ * Each file is read and parsed exactly once.
+ * @throws Error if `voiceStylePaths` is empty.
+ */
+export function loadVoiceStyle(voiceStylePaths: string[]): Style {
+  const bsz = voiceStylePaths.length;
+  if (bsz === 0) throw new Error("loadVoiceStyle requires at least one voice style path");
+
+  const styles = voiceStylePaths.map(
+    (path): VoiceStyleJson => JSON.parse(readFileSync(path, "utf8")),
+  );
+  // The first file fixes the per-item shape; only dims[1] and dims[2] are used.
+  const ttlDims = styles[0]!.style_ttl.dims;
+  const dpDims = styles[0]!.style_dp.dims;
+  const ttlSize = ttlDims[1]! * ttlDims[2]!;
+  const dpSize = dpDims[1]! * dpDims[2]!;
+
+  const ttlFlat = new Float32Array(bsz * ttlSize);
+  const dpFlat = new Float32Array(bsz * dpSize);
+  styles.forEach((style, i) => {
+    ttlFlat.set(flatten(style.style_ttl.data), i * ttlSize);
+    dpFlat.set(flatten(style.style_dp.data), i * dpSize);
+  });
+  const ttlStyle = new ort.Tensor("float32", ttlFlat, [bsz, ttlDims[1]!, ttlDims[2]!]);
+  const dpStyle = new ort.Tensor("float32", dpFlat, [bsz, dpDims[1]!, dpDims[2]!]);
+  return new Style(ttlStyle, dpStyle);
+}
+
+// ---------------------------------------------------------------------------
+// TextToSpeech pipeline
+// ---------------------------------------------------------------------------
+
+// Synthesis pipeline that chains four ONNX sessions: duration predictor →
+// text encoder → iterative vector estimator → vocoder.
+// Exported as the return type of loadTextToSpeech (required for declaration emit).
+// fallow-ignore-next-line unused-exports
+export class TextToSpeech {
+  /** Audio sample rate (samples per second), read from `cfgs.ae.sample_rate`. */
+  readonly sampleRate: number;
+  private readonly baseChunkSize: number;
+  private readonly chunkCompressFactor: number;
+  private readonly ldim: number;
+
+  /**
+   * @param cfgs Parsed model configuration; only `ae.sample_rate`,
+   *   `ae.base_chunk_size`, `ttl.chunk_compress_factor` and `ttl.latent_dim`
+   *   are read here.
+   * @param textProcessor Converts texts + languages into ids and a mask.
+   * @param dpOrt Duration predictor session.
+   * @param textEncOrt Text encoder session.
+   * @param vectorEstOrt Vector estimator (denoiser) session.
+   * @param vocoderOrt Vocoder session.
+   */
+  constructor(
+    cfgs: TtsConfig,
+    private readonly textProcessor: UnicodeProcessor,
+    private readonly dpOrt: ort.InferenceSession,
+    private readonly textEncOrt: ort.InferenceSession,
+    private readonly vectorEstOrt: ort.InferenceSession,
+    private readonly vocoderOrt: ort.InferenceSession,
+  ) {
+    this.sampleRate = cfgs.ae.sample_rate;
+    this.baseChunkSize = cfgs.ae.base_chunk_size;
+    this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor;
+    this.ldim = cfgs.ttl.latent_dim;
+  }
+
+  /**
+   * Draw the initial Gaussian-noise latent for a batch and the matching mask.
+   *
+   * The latent has shape [batch, ldim * chunkCompressFactor, latentLen], where
+   * latentLen is derived from the longest duration in the batch. Noise values
+   * are multiplied by the mask so positions the mask zeroes out stay zero.
+   *
+   * @param duration Per-item durations; multiplied by `sampleRate` to obtain
+   *   waveform lengths.
+   */
+  private sampleNoisyLatent(duration: number[]): {
+    noisyLatent: number[][][];
+    latentMask: number[][][];
+  } {
+    const wavLenMax = Math.max(...duration) * this.sampleRate;
+    const wavLengths = duration.map((d) => Math.floor(d * this.sampleRate));
+    const chunkSize = this.baseChunkSize * this.chunkCompressFactor;
+    const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize);
+    const latentDim = this.ldim * this.chunkCompressFactor;
+
+    // Lower bound for u1 so Math.log never sees 0.
+    const eps = 1e-10;
+
+    const noisyLatent: number[][][] = [];
+    for (let b = 0; b < duration.length; b++) {
+      const batch: number[][] = [];
+      for (let d = 0; d < latentDim; d++) {
+        const row: number[] = [];
+        for (let t = 0; t < latentLen; t++) {
+          // Box-Muller transform for a standard normal sample.
+          const u1 = Math.max(eps, Math.random());
+          const u2 = Math.random();
+          row.push(Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2));
+        }
+        batch.push(row);
+      }
+      noisyLatent.push(batch);
+    }
+
+    const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor);
+
+    // Apply the per-item mask (broadcast over the latent dimension).
+    for (let b = 0; b < noisyLatent.length; b++) {
+      for (let d = 0; d < noisyLatent[b]!.length; d++) {
+        for (let t = 0; t < noisyLatent[b]![d]!.length; t++) {
+          noisyLatent[b]![d]![t]! *= latentMask[b]![0]![t]!;
+        }
+      }
+    }
+
+    return { noisyLatent, latentMask };
+  }
+
+  /**
+   * Run the full pipeline on one batch.
+   *
+   * @param textList Texts to synthesize; one per style row.
+   * @param langList Language code for each text, passed to the text processor.
+   * @param style Batched voice style; `style.ttl.dims[0]` must equal
+   *   `textList.length`.
+   * @param totalStep Number of vector-estimator iterations.
+   * @param speed Predicted durations are divided by this value, so larger
+   *   values give shorter audio. Must be a positive, finite number.
+   * @returns `wav`: the vocoder output flattened to a number array;
+   *   `duration`: the speed-adjusted duration of each batch item.
+   * @throws Error on a text/style count mismatch, an empty batch, or an
+   *   invalid `speed`.
+   */
+  private async infer(
+    textList: string[],
+    langList: string[],
+    style: Style,
+    totalStep: number,
+    speed = 1.05,
+  ): Promise<{ wav: number[]; duration: number[] }> {
+    if (textList.length !== style.ttl.dims[0]) {
+      throw new Error("Number of texts must match number of style vectors");
+    }
+    if (textList.length === 0) {
+      // Without this the shape computation below fails with an opaque TypeError.
+      throw new Error("At least one text is required");
+    }
+    if (!Number.isFinite(speed) || speed <= 0) {
+      // Dividing by zero/negative/NaN yields infinite or NaN durations, which
+      // would make the latent length unbounded or invalid.
+      throw new Error(`Speed must be a positive, finite number (got ${speed})`);
+    }
+
+    const bsz = textList.length;
+    const { textIds, textMask } = this.textProcessor.call(textList, langList);
+    const textIdsShape = [bsz, textIds[0]!.length];
+    const textMaskShape = [bsz, 1, textMask[0]![0]!.length];
+
+    // Both tensors are shared by every session run below.
+    const textIdsTensor = intArrayToTensor(textIds, textIdsShape);
+    const textMaskTensor = arrayToTensor(textMask, textMaskShape);
+
+    const dpResult = await this.dpOrt.run({
+      text_ids: textIdsTensor,
+      style_dp: style.dp,
+      text_mask: textMaskTensor,
+    });
+
+    const durOnnx = tensorToNumbers(dpResult.duration!);
+    // Faster speech → shorter duration.
+    for (let i = 0; i < durOnnx.length; i++) {
+      durOnnx[i]! /= speed;
+    }
+
+    const textEncResult = await this.textEncOrt.run({
+      text_ids: textIdsTensor,
+      style_ttl: style.ttl,
+      text_mask: textMaskTensor,
+    });
+    const textEmbTensor = textEncResult.text_emb!;
+
+    const { noisyLatent, latentMask } = this.sampleNoisyLatent(durOnnx);
+    const latentShape = [bsz, noisyLatent[0]!.length, noisyLatent[0]![0]!.length];
+    const latentMaskShape = [bsz, 1, latentMask[0]![0]!.length];
+
+    const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape);
+
+    const totalStepTensor = arrayToTensor(new Array<number>(bsz).fill(totalStep), [bsz]);
+
+    // Iterative refinement: each step feeds the current latent to the vector
+    // estimator and replaces it with the returned `denoised_latent`.
+    for (let step = 0; step < totalStep; step++) {
+      const currentStepArray = new Array<number>(bsz).fill(step);
+
+      const vectorEstResult = await this.vectorEstOrt.run({
+        noisy_latent: arrayToTensor(noisyLatent, latentShape),
+        text_emb: textEmbTensor,
+        style_ttl: style.ttl,
+        text_mask: textMaskTensor,
+        latent_mask: latentMaskTensor,
+        total_step: totalStepTensor,
+        current_step: arrayToTensor(currentStepArray, [bsz]),
+      });
+
+      const denoisedLatent = tensorToNumbers(vectorEstResult.denoised_latent!);
+
+      // Copy the flat result back into the nested [batch][dim][time] layout.
+      let idx = 0;
+      for (let b = 0; b < noisyLatent.length; b++) {
+        for (let d = 0; d < noisyLatent[b]!.length; d++) {
+          for (let t = 0; t < noisyLatent[b]![d]!.length; t++) {
+            noisyLatent[b]![d]![t] = denoisedLatent[idx++]!;
+          }
+        }
+      }
+    }
+
+    const vocoderResult = await this.vocoderOrt.run({
+      latent: arrayToTensor(noisyLatent, latentShape),
+    });
+
+    return { wav: tensorToNumbers(vocoderResult.wav_tts!), duration: durOnnx };
+  }
+
+  /**
+   * Single-speaker synthesis with automatic chunking for long text. Chunks are
+   * joined with `silenceDuration` seconds of silence.
+   *
+   * Text is chunked at 120 characters for `ko`/`ja` and 300 otherwise, and the
+   * chunks are synthesized one after another.
+   *
+   * @param text Text to synthesize.
+   * @param lang Language code applied to every chunk.
+   * @param style Voice style with a batch size of exactly 1.
+   * @param totalStep Number of vector-estimator iterations per chunk.
+   * @param speed Speed factor forwarded to each chunk's inference.
+   * @param silenceDuration Seconds of silence inserted between chunks.
+   * @returns The concatenated waveform and a one-element `duration` array
+   *   holding the chunk durations plus the inserted silences. When the text
+   *   yields no chunks the result is an empty waveform with duration 0.
+   * @throws Error if `style` holds more than one voice.
+   */
+  async call(
+    text: string,
+    lang: string,
+    style: Style,
+    totalStep: number,
+    speed = 1.05,
+    silenceDuration = 0.3,
+  ): Promise<{ wav: number[]; duration: number[] }> {
+    if (style.ttl.dims[0] !== 1) {
+      throw new Error("Single speaker text to speech only supports a single style");
+    }
+    const maxLen = lang === "ko" || lang === "ja" ? 120 : 300;
+    const textList = chunkText(text, maxLen);
+
+    // Collect the pieces and concatenate once at the end; re-spreading the
+    // accumulated waveform for every chunk would copy it quadratically.
+    const parts: number[][] = [];
+    let silence: number[] | undefined;
+    let durCat = 0;
+
+    for (const chunk of textList) {
+      const { wav, duration } = await this.infer([chunk], [lang], style, totalStep, speed);
+      if (parts.length === 0) {
+        durCat = duration[0]!;
+      } else {
+        if (silence === undefined) {
+          // Built lazily (only needed from the second chunk on) and shared by
+          // every gap, since it is never mutated.
+          silence = new Array<number>(Math.floor(silenceDuration * this.sampleRate)).fill(0);
+        }
+        parts.push(silence);
+        durCat += duration[0]! + silenceDuration;
+      }
+      parts.push(wav);
+    }
+
+    const wavCat = parts.length === 1 ? parts[0]! : ([] as number[]).concat(...parts);
+    return { wav: wavCat, duration: [durCat] };
+  }
+
+  /** Batch synthesis (one style + lang per text), no automatic chunking. */
+  async batch(
+    textList: string[],
+    langList: string[],
+    style: Style,
+    totalStep: number,
+    speed = 1.05,
+  ): Promise<{ wav: number[]; duration: number[] }> {
+    return this.infer(textList, langList, style, totalStep, speed);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Loaders
+// ---------------------------------------------------------------------------
+
+/**
+ * Build a TextToSpeech pipeline from the assets in `onnxDir`: `tts.json`
+ * (config), the four `.onnx` models and `unicode_indexer.json` (tokenizer).
+ * CPU only; upstream has not yet shipped a GPU path.
+ */
+export async function loadTextToSpeech(onnxDir: string): Promise<TextToSpeech> {
+  const inDir = (file: string): string => join(onnxDir, file);
+
+  const config: TtsConfig = JSON.parse(readFileSync(inDir("tts.json"), "utf8"));
+
+  // Default session options, shared by all four sessions.
+  const sessionOptions = {};
+  const createSession = (model: string) =>
+    ort.InferenceSession.create(inDir(model), sessionOptions);
+
+  // The sessions are independent, so create them concurrently.
+  const [durationPredictor, textEncoder, vectorEstimator, vocoder] = await Promise.all([
+    createSession("duration_predictor.onnx"),
+    createSession("text_encoder.onnx"),
+    createSession("vector_estimator.onnx"),
+    createSession("vocoder.onnx"),
+  ]);
+
+  const tokenizer = new UnicodeProcessor(inDir("unicode_indexer.json"));
+  return new TextToSpeech(
+    config,
+    tokenizer,
+    durationPredictor,
+    textEncoder,
+    vectorEstimator,
+    vocoder,
+  );
+}
+
+/**
+ * Write a mono 16-bit PCM WAV file. Samples are clamped to [-1, 1].
+ *
+ * Layout: a 44-byte RIFF/WAVE header followed by little-endian signed 16-bit
+ * samples, each computed as floor(clamped * 32767).
+ *
+ * @param filename Destination path; written in one synchronous call.
+ * @param audioData Samples to encode, one channel.
+ * @param sampleRate Value stored in the header's sample-rate field.
+ */
+export function writeWavFile(filename: string, audioData: number[], sampleRate: number): void {
+  const headerBytes = 44;
+  const channels = 1;
+  const bitDepth = 16;
+  const frameBytes = (channels * bitDepth) / 8;
+  const payloadBytes = (audioData.length * bitDepth) / 8;
+
+  const out = Buffer.alloc(headerBytes + payloadBytes);
+
+  // RIFF container: chunk size covers everything after the first 8 bytes.
+  out.write("RIFF", 0);
+  out.writeUInt32LE(36 + payloadBytes, 4);
+  out.write("WAVE", 8);
+
+  // "fmt " sub-chunk (16 bytes of format data).
+  out.write("fmt ", 12);
+  out.writeUInt32LE(16, 16);
+  out.writeUInt16LE(1, 20); // PCM
+  out.writeUInt16LE(channels, 22);
+  out.writeUInt32LE(sampleRate, 24);
+  out.writeUInt32LE((sampleRate * channels * bitDepth) / 8, 28); // byte rate
+  out.writeUInt16LE(frameBytes, 32); // block align
+  out.writeUInt16LE(bitDepth, 34);
+
+  // "data" sub-chunk header, then the samples themselves.
+  out.write("data", 36);
+  out.writeUInt32LE(payloadBytes, 40);
+
+  let cursor = headerBytes;
+  for (const raw of audioData) {
+    const clamped = Math.max(-1, Math.min(1, raw));
+    out.writeInt16LE(Math.floor(clamped * 32767), cursor);
+    cursor += 2;
+  }
+
+  writeFileSync(filename, out);
+}
+
+/**
+ * Split text into <= maxLen segments on paragraph then sentence boundaries.
+ *
+ * Paragraphs are separated by blank lines and are never merged. Within a
+ * paragraph, sentences (split after `.`, `!` or `?` followed by whitespace,
+ * except after common abbreviations and single capital initials) are packed
+ * greedily, joined by a single space.
+ *
+ * A sentence that is itself longer than `maxLen` — common for text whose
+ * sentences are not separated by whitespace, e.g. Japanese — is broken up
+ * further so the length limit actually holds: first on whitespace, then after
+ * punctuation, and as a last resort by a hard cut. Whitespace inside such a
+ * sentence is normalized to single spaces. Sentences that already fit are
+ * handled exactly as before.
+ *
+ * @param text Input text.
+ * @param maxLen Maximum chunk length in UTF-16 code units. Values below 1 (or
+ *   NaN) disable the oversized-sentence splitting.
+ * @returns The non-empty chunks in reading order; empty for blank input.
+ */
+function chunkText(text: string, maxLen = 300): string[] {
+  const paragraphs = text
+    .trim()
+    .split(/\n\s*\n+/)
+    .filter((p) => p.trim());
+
+  const chunks: string[] = [];
+
+  for (let paragraph of paragraphs) {
+    paragraph = paragraph.trim();
+    if (!paragraph) continue;
+
+    const sentences = paragraph.split(
+      /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/,
+    );
+
+    let currentChunk = "";
+    for (const sentence of sentences) {
+      // Only sentences that cannot fit on their own are broken up further.
+      const units =
+        maxLen >= 1 && sentence.length > maxLen
+          ? splitOversizedSentence(sentence, Math.floor(maxLen))
+          : [sentence];
+
+      for (const unit of units) {
+        if (currentChunk.length + unit.length + 1 <= maxLen) {
+          currentChunk += (currentChunk ? " " : "") + unit;
+        } else {
+          if (currentChunk) chunks.push(currentChunk.trim());
+          currentChunk = unit;
+        }
+      }
+    }
+    if (currentChunk) chunks.push(currentChunk.trim());
+  }
+
+  return chunks;
+}
+
+/**
+ * Break a sentence longer than `limit` into pieces of at most `limit` UTF-16
+ * code units. Pieces are whitespace-delimited words; a word that is still too
+ * long is cut after the last punctuation mark inside the window when there is
+ * one, otherwise at the limit (without splitting a surrogate pair when
+ * `limit` > 1).
+ *
+ * @param sentence Sentence to split.
+ * @param limit Maximum piece length; must be an integer >= 1.
+ */
+function splitOversizedSentence(sentence: string, limit: number): string[] {
+  // Greedy match up to the LAST Latin or CJK punctuation mark in the window.
+  const lastPunctuation = /^[\s\S]*[.!?,;:。！？、，；：]/;
+
+  const pieces: string[] = [];
+  for (const word of sentence.split(/\s+/)) {
+    let rest = word;
+    while (rest.length > limit) {
+      const punctuated = lastPunctuation.exec(rest.slice(0, limit));
+      let cut = punctuated ? punctuated[0].length : limit;
+      if (!punctuated && cut > 1) {
+        const code = rest.charCodeAt(cut - 1);
+        // Do not separate a high surrogate from its low surrogate.
+        if (code >= 0xd800 && code <= 0xdbff) cut -= 1;
+      }
+      pieces.push(rest.slice(0, cut));
+      rest = rest.slice(cut);
+    }
+    if (rest) pieces.push(rest);
+  }
+  return pieces;
+}
diff --git a/packages/cli/src/utils/download.ts b/packages/cli/src/utils/download.ts
index 5b8825625..c8cf10f0e 100644
--- a/packages/cli/src/utils/download.ts
+++ b/packages/cli/src/utils/download.ts
@@ -1,6 +1,7 @@
 import { createWriteStream, renameSync, unlinkSync } from "node:fs";
 import { get as httpsGet } from "node:https";
 import { pipeline } from "node:stream/promises";
+import { URL } from "node:url";
 
 /**
  * Download a file from a URL, following redirects.
@@ -10,12 +11,16 @@ import { pipeline } from "node:stream/promises";
 export function downloadFile(url: string, dest: string): Promise<void> {
   const tmp = `${dest}.tmp`;
   return new Promise((resolve, reject) => {
-    const follow = (u: string) => {
+    const follow = (u: string, redirects = 0) => {
       httpsGet(u, (res) => {
-        if (res.statusCode === 301 || res.statusCode === 302) {
+        if ([301, 302, 303, 307, 308].includes(res.statusCode ?? 0)) {
           const location = res.headers.location;
           if (location) {
-            follow(location);
+            if (redirects >= 10) {
+              reject(new Error("Download failed: too many redirects"));
+              return;
+            }
+            follow(new URL(location, u).toString(), redirects + 1);
             return;
           }
         }
diff --git a/skills/hyperframes-media/SKILL.md b/skills/hyperframes-media/SKILL.md
index 13e3d4ae0..c6a296adb 100644
--- a/skills/hyperframes-media/SKILL.md
+++ b/skills/hyperframes-media/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: hyperframes-media
-description: Asset preprocessing for HyperFrames compositions — text-to-speech narration (Kokoro), audio/video transcription (Whisper), and background removal for transparent overlays (u2net). Use when generating voiceover from text, transcribing speech for captions, removing the background from a video or image to use as a transparent overlay, choosing a TTS voice or whisper model, or chaining these (TTS → transcribe → captions). Each command downloads its own model on first run.
+description: Asset preprocessing for HyperFrames compositions — text-to-speech narration (Kokoro for English/Chinese, Supertonic 3 for 31 languages), audio/video transcription (Whisper), and background removal for transparent overlays (u2net). Use when generating voiceover from text, transcribing speech for captions, removing the background from a video or image to use as a transparent overlay, choosing a TTS engine/voice or whisper model, or chaining these (TTS → transcribe → captions). Each command downloads its own model on first run.
 ---
 
 # HyperFrames Media Preprocessing
@@ -9,14 +9,29 @@ Three CLI commands that produce assets for compositions: `tts` (speech), `transc
 
 ## Text-to-Speech (`tts`)
 
-Generate speech audio locally with Kokoro-82M. No API key.
+Generate speech audio locally. No API key. Two engines, selected with `--engine`:
+
+- **`kokoro`** (default) — Kokoro-82M. 54 named voices that can be matched to the content (see Voice Selection). Best for **English**; the only engine that supports **Chinese**. Always requires Python; non-English text additionally needs `espeak-ng` (see Requirements).
+- **`supertonic`** — Supertonic 3. Runs fully in-process (no Python, no phonemizer). Covers **31 languages**. Preferred for **any non-English language except Chinese**.
 
 ```bash
 npx hyperframes tts "Text here" --voice af_nova --output narration.wav
 npx hyperframes tts script.txt --voice bf_emma --output narration.wav
-npx hyperframes tts --list                       # all 54 voices
+npx hyperframes tts --list                                 # Kokoro's 54 voices
+npx hyperframes tts --list --engine supertonic             # Supertonic's voices
 ```
 
+### Choosing an engine (language routing)
+
+| Language                                                         | Engine                  | Why                              |
+| ---------------------------------------------------------------- | ----------------------- | -------------------------------- |
+| English                                                          | `kokoro` (default)      | Rich, content-matched voices     |
+| Chinese / Mandarin                                               | `kokoro` (`zf_xiaobei`) | **Only** Kokoro supports `zh`    |
+| Any other language (Korean, German, Russian, Arabic, Dutch, …)   | `supertonic`            | Kokoro can't; no extra deps      |
+| Overlap (Spanish, French, Hindi, Italian, Japanese, Portuguese)  | either                  | Supertonic if avoiding espeak-ng |
+
+Rule of thumb: **English or Chinese → Kokoro. Everything else → Supertonic.**
+
 ### Voice Selection
 
 Match voice to content. Default is `af_heart`.
@@ -31,14 +46,23 @@ Match voice to content. Default is `af_heart`.
 
 ### Multilingual
 
-Voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from the prefix — no `--lang` needed when the voice matches the text.
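+For non-English (except Chinese), use **Supertonic**. Pass the language with `--lang` and pick a voice: `F1`–`F5` or `M1`–`M5`. Supertonic voices are multilingual — the ID encodes only the gender (`F`/`M`), not a language — so any voice works with any `--lang`. No phonemizer or system packages needed.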
+
+```bash
+npx hyperframes tts "안녕하세요, 만나서 반갑습니다" --engine supertonic --lang ko --voice F1 --output ko.wav
+npx hyperframes tts "Guten Tag, schön Sie zu sehen" --engine supertonic --lang de --voice M1 --output de.wav
+```
+
+Supertonic `--lang` codes (31): `ar` `bg` `cs` `da` `de` `el` `en` `es` `et` `fi` `fr` `hi` `hr` `hu` `id` `it` `ja` `ko` `lt` `lv` `nl` `pl` `pt` `ro` `ru` `sk` `sl` `sv` `tr` `uk` `vi`. **No Chinese** — use Kokoro for that.
+
+**Kokoro** stays best for English and is the only option for Chinese. Its voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from the prefix — no `--lang` needed when the voice matches the text.
 
 ```bash
+npx hyperframes tts "你好，今天天气很好" --voice zf_xiaobei --output zh.wav   # Chinese → Kokoro
 npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav
-npx hyperframes tts "今日はいい天気ですね" --voice jf_alpha --output ja.wav
 ```
 
-Use `--lang` only to override auto-detection (stylized accents). Valid codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`. Non-English phonemization requires `espeak-ng` system-wide (`brew install espeak-ng` / `apt-get install espeak-ng`).
+For Kokoro, use `--lang` only to override auto-detection (stylized accents). Valid codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`. Kokoro non-English phonemization requires `espeak-ng` system-wide (`brew install espeak-ng` / `apt-get install espeak-ng`) — another reason to prefer Supertonic for those languages.
 
 ### Speed
 
@@ -53,7 +77,8 @@ For more than a few paragraphs, write to a `.txt` file and pass the path. Inputs
 
 ### Requirements
 
-Python 3.8+ with `kokoro-onnx` and `soundfile` (`pip install kokoro-onnx soundfile`). Model downloads on first use (~311 MB + ~27 MB voices, cached in `~/.cache/hyperframes/tts/`).
+- **Kokoro** (`--engine kokoro`, default): Python 3.8+ with `kokoro-onnx` and `soundfile` (`pip install kokoro-onnx soundfile`); non-English also needs `espeak-ng` system-wide. Model downloads on first use (~311 MB + ~27 MB voices, cached in `~/.cache/hyperframes/tts/`).
+- **Supertonic** (`--engine supertonic`): no Python, no system packages — runs in-process via onnxruntime-node. Models (~300 MB) download from Hugging Face on first use, cached in `~/.cache/hyperframes/tts/supertonic/`. Tune quality/speed with `--steps` (default 8; fewer is faster).
 
 ## Transcription (`transcribe`)