
Commit 64d0e06

jcggl and claude committed
feat: bump V2 to 0.4.10 with streaming ONNX and reInferWithEmotion()
- V2 CDN 0.4.8 → 0.4.10 across all examples and configs
- Guide emotion sliders now use reInferWithEmotion() for real-time emotion changes during file playback (debounced 300 ms)
- Mic streaming uses setEmotion() only (no re-infer, to avoid resetting LSTM state)
- API Reference: add reInferWithEmotion() with usage notes and caveats
- V2 pipeline docs: streaming ONNX (UniLSTM + CausalTransformer + FiLM), 5-frame chunks (~167 ms), LSTM state carried between chunks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c44bf88 commit 64d0e06

File tree

7 files changed: +38 −11 lines


.well-known/agent-card.json

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
   "name": "AnimaSync",
   "description": "Voice-driven 3D avatar animation engine for the browser. Extracts emotion from speech and generates lip sync, facial expressions, and body motion in real time — entirely client-side via Rust/WASM and ONNX inference.",
   "url": "https://animasync.quasar.ggls.dev/",
-  "version": "0.4.8",
+  "version": "0.4.10",
   "provider": {
     "organization": "GoodGang Labs",
     "url": "https://goodganglabs.com"

README.md

Lines changed: 3 additions & 1 deletion

@@ -221,7 +221,9 @@ The production site is available at **[animasync.quasar.ggls.dev](https://animas
 ```
 Audio 16kHz PCM
   → [WASM] librosa-compatible features: 141-dim @30fps
-  → [JS] ONNX emotion model + FiLM conditioning → 52-dim (lip sync + expressions)
+  → [JS] Streaming ONNX (UniLSTM + CausalTransformer + FiLM) → 52-dim
+         Inputs: features + 5-dim emotion + LSTM h/c + conv context
+         Chunk size: 5 frames (~167ms), state carried between chunks
   → [WASM] crisp_mouth (mouth sharpening) → fade_in_out (natural onset/offset)
   → [WASM] add_blinks (stochastic eye animation)
   → [WASM] Preset blending: expression channels (brows, eyes) blended with lip sync
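
The chunk arithmetic in the pipeline note above can be checked directly. This is a sketch only, independent of the wrapper: the packaged streaming path does its own chunking and state carry internally, and the `chunks` helper below is purely illustrative.

```javascript
// Sketch only: the chunk sizing implied by the pipeline note above.
// At 16 kHz input and 30 fps features, one frame spans 16000/30,
// about 533 samples, so a 5-frame chunk is ~2665 samples (~167 ms).
const SAMPLE_RATE = 16000;
const FPS = 30;
const FRAMES_PER_CHUNK = 5;
const samplesPerFrame = Math.round(SAMPLE_RATE / FPS);      // 533
const samplesPerChunk = samplesPerFrame * FRAMES_PER_CHUNK; // 2665
const chunkMs = (samplesPerChunk / SAMPLE_RATE) * 1000;     // ~166.6

// Split a mono PCM buffer into ordered chunk-sized slices. The engine
// carries LSTM h/c and conv context between chunks, so feed them in order.
function* chunks(pcm) {
  for (let i = 0; i + samplesPerChunk <= pcm.length; i += samplesPerChunk) {
    yield pcm.subarray(i, i + samplesPerChunk);
  }
}
```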

agents.json

Lines changed: 1 addition & 1 deletion

@@ -150,7 +150,7 @@
   },
   {
     "action": "init-animasync",
-    "code": "<script type=\"module\">\nconst CDN = 'https://cdn.jsdelivr.net/npm/@goodganglabs/lipsync-wasm-v2@0.4.8';\nconst { LipSyncWasmWrapper } = await import(`${CDN}/lipsync-wasm-wrapper.js`);\nconst lipsync = new LipSyncWasmWrapper({ wasmPath: `${CDN}/lipsync_wasm_v2.js` });\nawait lipsync.init();\n</script>",
+    "code": "<script type=\"module\">\nconst CDN = 'https://cdn.jsdelivr.net/npm/@goodganglabs/lipsync-wasm-v2@0.4.10';\nconst { LipSyncWasmWrapper } = await import(`${CDN}/lipsync-wasm-wrapper.js`);\nconst lipsync = new LipSyncWasmWrapper({ wasmPath: `${CDN}/lipsync_wasm_v2.js` });\nawait lipsync.init();\n</script>",
     "description": "Import and initialize AnimaSync V2 from CDN"
   }
 ]

examples/guide/index.html

Lines changed: 15 additions & 3 deletions

@@ -953,7 +953,7 @@ <h2 class="step-title">Add Real-time Microphone</h2>
 // Config
 // ════════════════════════════════════════
 const VERSION_V1 = '0.4.5';
-const VERSION_V2 = '0.4.8';
+const VERSION_V2 = '0.4.10';
 const CDN_V1 = `https://cdn.jsdelivr.net/npm/@goodganglabs/lipsync-wasm-v1@${VERSION_V1}`;
 const CDN_V2 = `https://cdn.jsdelivr.net/npm/@goodganglabs/lipsync-wasm-v2@${VERSION_V2}`;
 let selectedEngine = 'v1';

@@ -1207,10 +1207,22 @@ <h2 class="step-title">Add Real-time Microphone</h2>
 const emotionSliders = EMOTION_KEYS.map(k => $(`emo-${k}`));
 const emotionVals = EMOTION_KEYS.map(k => $(`emo-${k}-val`));

+let _reInferTimer = null;
 function updateEmotionVector() {
   const vec = emotionSliders.map(s => parseInt(s.value) / 100);
-  if (lipsync?.setEmotion) {
-    try { lipsync.setEmotion(vec); } catch (e) { console.warn('setEmotion:', e.message); }
+  if (!lipsync?.setEmotion) return;
+  try { lipsync.setEmotion(vec); } catch (e) { console.warn('setEmotion:', e.message); return; }
+
+  // File playback: debounced re-inference with new emotion (300ms)
+  // Mic streaming: setEmotion() above is enough — each chunk uses current vector
+  if (filePlaying && fileResult && lipsync.reInferWithEmotion && !micActive) {
+    clearTimeout(_reInferTimer);
+    _reInferTimer = setTimeout(async () => {
+      try {
+        const newResult = await lipsync.reInferWithEmotion();
+        if (filePlaying) fileResult = newResult;
+      } catch (e) { console.warn('reInferWithEmotion:', e.message); }
+    }, 300);
   }
 }

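The guide's slider handler is a plain trailing-edge debounce around the re-inference call. Isolated from the page it looks like this; the `lipsync`/`slider` names in the usage comment are assumptions standing in for the guide's own objects.

```javascript
// Trailing-edge debounce: rapid calls collapse into one invocation
// `ms` after the last call. This is the pattern the guide applies to
// reInferWithEmotion() so a slider drag triggers a single re-inference.
function debounce(fn, ms) {
  let timer = null;
  return (...args) => {
    clearTimeout(timer);
    timer = setTimeout(() => fn(...args), ms);
  };
}

// Usage shape (sketch; assumes a lipsync instance as in the guide):
// const reInfer = debounce(() => lipsync.reInferWithEmotion(), 300);
// slider.addEventListener('input', reInfer);
```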
examples/vanilla-avatar/index.html

Lines changed: 1 addition & 1 deletion

@@ -199,7 +199,7 @@ <h2>52 ARKit Blendshapes — V2 Emotion</h2>
 // No 3D avatar, no Three.js. Pure audio → lip sync data (52-dim).
 // ================================================================

-const VERSION = '0.4.8';
+const VERSION = '0.4.10';
 const CDN = `https://cdn.jsdelivr.net/npm/@goodganglabs/lipsync-wasm-v2@${VERSION}`;

 // ── All 52 ARKit blendshape channels ──

examples/vanilla-comparison/index.html

Lines changed: 1 addition & 1 deletion

@@ -254,7 +254,7 @@ <h1>Anima<span>Sync</span></h1>
 // Config
 // ================================================================
 const VERSION_V1 = '0.4.5';
-const VERSION_V2 = '0.4.8';
+const VERSION_V2 = '0.4.10';
 const CDN_V1 = `https://cdn.jsdelivr.net/npm/@goodganglabs/lipsync-wasm-v1@${VERSION_V1}`;
 const CDN_V2 = `https://cdn.jsdelivr.net/npm/@goodganglabs/lipsync-wasm-v2@${VERSION_V2}`;
llms-full.txt

Lines changed: 16 additions & 3 deletions

@@ -138,6 +138,17 @@ lipsync.setEmotion([0, 0.8, 0, 0, 0]); // 80% joy

 Returns the current 5-dim emotion vector.

+#### `reInferWithEmotion(emotionVec?: number[]): Promise<ProcessResult>` (V2 only, v0.4.10+)
+
+Re-run ONNX inference on cached audio features with a new emotion vector, without re-uploading or re-decoding audio. Requires a prior `processFile()`/`processAudio()` call (uses internally cached features). Do NOT call during mic streaming — it resets LSTM state.
+
+```javascript
+const result = await lipsync.processFile(audioFile);
+// Later, change emotion without re-uploading:
+const joyResult = await lipsync.reInferWithEmotion([0, 1.0, 0, 0, 0]);
+const angryResult = await lipsync.reInferWithEmotion([0, 0, 0.8, 0, 0]);
+```
+
 #### `reset(): void`

 Clear streaming state. Call between utterances when using `processAudioChunk`.

@@ -171,7 +182,7 @@ interface ProcessResult {
 | VRM mode | getVrmFrame() + convert_arkit_to_vrm() for VRM 18-dim | getVrmFrame() for VRM 18-dim |
 | Voice activity | Built-in VoiceActivityDetector | Not included |
 | ONNX fallback | Heuristic mode (energy-based) | None (ONNX required) |
-| Emotion control | Not included | 5-dim FiLM conditioning (neutral, joy, anger, sadness, surprise) via setEmotion()/getEmotion() |
+| Emotion control | Not included | 5-dim FiLM conditioning (neutral, joy, anger, sadness, surprise) via setEmotion()/getEmotion()/reInferWithEmotion() |
 | Body motion | VRMA idle/speaking + VAD auto-switch (LoopPingPong, asymmetric crossfade) | VRMA idle/speaking (LoopPingPong, asymmetric crossfade 0.8s/1.0s) |
 | Best for | Full expression control, custom avatars | Emotion-aware lip sync, quick integration |

@@ -184,7 +195,9 @@ interface ProcessResult {
 ```
 Audio 16kHz PCM
 -> [WASM] librosa-compatible features: 141-dim @30fps
--> [JS] ONNX emotion model + FiLM conditioning -> 52-dim (lip sync + expressions)
+-> [JS] Streaming ONNX (UniLSTM + CausalTransformer + FiLM) -> 52-dim
+   Inputs: features + 5-dim emotion + LSTM h/c + conv context
+   Chunk size: 5 frames (~167ms), state carried between chunks
 -> [WASM] crisp_mouth (mouth sharpening) -> fade_in_out (natural onset/offset)
 -> [WASM] add_blinks (stochastic eye animation)
 -> [WASM] Preset blending: expression channels blended with lip sync

@@ -224,7 +237,7 @@ Tongue: tongueOut

 | Example | Description | URL |
 |---------|-------------|-----|
-| Step-by-Step Guide | 6-step interactive tutorial with V1/V2 engine selector, V2 emotion control panel (5 sliders + presets), VRM mode auto-detect, idle eye blink, audio-synced playback, LoopPingPong idle, asymmetric crossfade (V1 0.4.5, V2 0.4.8) | https://animasync.quasar.ggls.dev/examples/guide/ |
+| Step-by-Step Guide | 6-step interactive tutorial with V1/V2 engine selector, V2 emotion control panel (5 sliders + presets), VRM mode auto-detect, idle eye blink, audio-synced playback, LoopPingPong idle, asymmetric crossfade (V1 0.4.5, V2 0.4.10) | https://animasync.quasar.ggls.dev/examples/guide/ |
 | V1 Data | V1 phoneme engine — 52 ARKit blendshapes visualization | https://animasync.quasar.ggls.dev/examples/vanilla-basic/ |
 | V2 Data | V2 emotion model — 52 ARKit with 5-dim FiLM conditioning | https://animasync.quasar.ggls.dev/examples/vanilla-avatar/ |
 | V1 vs V2 | Side-by-side dual avatar comparison | https://animasync.quasar.ggls.dev/examples/vanilla-comparison/ |
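
The 5-dim emotion vector order (neutral, joy, anger, sadness, surprise) recurs throughout the API surface touched by this commit. A small helper that builds one by name; `emotionVec` is hypothetical convenience code, not part of the package:

```javascript
// Hypothetical helper (not in @goodganglabs/lipsync-wasm-v2): build the
// 5-dim emotion vector [neutral, joy, anger, sadness, surprise] by name,
// suitable for setEmotion() or reInferWithEmotion().
const EMOTIONS = ['neutral', 'joy', 'anger', 'sadness', 'surprise'];

function emotionVec(name, intensity = 1.0) {
  const i = EMOTIONS.indexOf(name);
  if (i === -1) throw new Error(`unknown emotion: ${name}`);
  const vec = [0, 0, 0, 0, 0];
  vec[i] = intensity;
  return vec;
}

// emotionVec('joy', 0.8) → [0, 0.8, 0, 0, 0], matching the setEmotion docs.
```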

0 commit comments
