fix(vibevoice): preserve quantization metadata in sanitize() for quantized model loading

korale77 · korale77 · commit a73408bdfdc5 · 2026-03-25T04:11:49.000-07:00
sanitize() drops weight keys that are not found in the model's current parameter shapes. Because the model is not yet quantized at sanitize time, the quantization metadata keys (.scales, .biases) are silently removed. Later, apply_quantization() checks for these keys to decide which layers to quantize; it finds none, skips quantization, and loading then fails with a shape mismatch. This change preserves .scales and .biases keys through sanitization, matching the existing pattern in chatterbox/s3gen. Same class of bug as Blaizzy#584 (fish_qwen3_omni sanitize fix).
diff --git a/mlx_audio/tts/models/vibevoice/vibevoice.py b/mlx_audio/tts/models/vibevoice/vibevoice.py
@@ -250,8 +250,9 @@ def transform_key(key: str) -> str:
 
             # Check if key exists in model
             if new_key not in curr_shapes:
-                # Debug: uncomment to see missing keys
-                # print(f"Warning: Key {new_key} (from {k}) not found in model")
+                # Preserve quantization metadata -- model isn't quantized yet at sanitize time
+                if new_key.endswith((".scales", ".biases")):
+                    new_weights[new_key] = v
                 continue
 
             target_shape = curr_shapes[new_key]
diff --git a/mlx_audio/tts/tests/test_models.py b/mlx_audio/tts/tests/test_models.py
@@ -1318,6 +1318,31 @@ def test_sanitize_huggingface_keys(self):
         self.assertNotIn("model.prediction_head.t_embedder.mlp.0.weight", sanitized)
         self.assertNotIn("model.prediction_head.adaLN_modulation.1.weight", sanitized)
 
+    def test_sanitize_preserves_quantization_metadata(self):
+        """Test that sanitize() keeps quantization metadata (.scales/.biases) keys."""
+        from mlx.utils import tree_flatten
+
+        from mlx_audio.tts.models.vibevoice.vibevoice import Model
+
+        config = self._default_config
+        model = Model(config)
+
+        # Start from the freshly built model's own flattened parameters
+        weights = dict(tree_flatten(model.parameters()))
+
+        # Add mock quantization metadata for the parameter named in the bug report ("Expected
+        # shape (151936, 896) but received shape (151936, 224) for parameter <quant_key>").
+        # NOTE(review): MLX usually names these "<layer>.scales", not "<layer>.weight.scales" -- confirm.
+        quant_key = "language_model.embed_tokens.weight"
+        weights[f"{quant_key}.scales"] = mx.ones((1,))
+        weights[f"{quant_key}.biases"] = mx.ones((1,))
+
+        sanitized = model.sanitize(weights)
+
+        # Both metadata keys must still be present in sanitize()'s output
+        self.assertIn(f"{quant_key}.scales", sanitized)
+        self.assertIn(f"{quant_key}.biases", sanitized)
+
     def test_config_defaults(self):
         """Test VibeVoiceModel uses correct config defaults."""
         from mlx_audio.tts.models.vibevoice.config import ModelConfig