Prep for Voice Steering feature (#141)

apresence · apresence · ylacombe · web-flow · commit 31816bdf80d0 · 2024-10-14T14:07:31.000+02:00
* Prep for Voice Steering feature Credits: 1. ylacombe - Add input_values to DACModel - dac_wrapper/modeling_dac.py - #110 (comment) 2. stg2015 - Delay mask adjustment for input_values - modeling_parler_tts.py - #81 (comment) * Prep for voice steering/cloning w/ fix for non-streaming generation * Applied simpler input handling per Guppy16's suggestion * Applied Guppy16's suggested optimization * Applied Guppy16's suggested optimization for voice steering * Update parler_tts/modeling_parler_tts.py --------- Co-authored-by: apresence <apresence@gmail.com> Co-authored-by: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
diff --git a/parler_tts/dac_wrapper/modeling_dac.py b/parler_tts/dac_wrapper/modeling_dac.py
@@ -12,6 +12,9 @@
 class DACModel(PreTrainedModel):
     config_class = DACConfig
 
+    # Set main input to 'input_values' for voice steering
+    main_input_name = "input_values"
+
     def __init__(self, config):
         super().__init__(config)
 
diff --git a/parler_tts/modeling_parler_tts.py b/parler_tts/modeling_parler_tts.py
@@ -3483,13 +3483,17 @@ def generate(
         # Apply the pattern mask to the final ids
         output_ids = self.decoder.apply_delay_pattern_mask(output_ids, model_kwargs["decoder_delay_pattern_mask"])
 
-        # Revert the pattern delay mask by filtering the eos and bos token ids from the delay pattern mask
-        _, mask = self.decoder.build_delay_pattern_mask(
-            input_ids,
-            bos_token_id=generation_config._bos_token_tensor,
-            pad_token_id=generation_config._pad_token_tensor,
-            max_length=output_ids.shape[1],
-        )
+        if "input_values" in model_kwargs:
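+            # Voice steering: input_values were supplied, so take the mask from output_ids itself
+            # (delay pattern already applied above) instead of rebuilding it from input_ids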
+            mask = output_ids
+        else:
+            # Revert the pattern delay mask by filtering the eos and bos token ids from the delay pattern mask
+            _, mask = self.decoder.build_delay_pattern_mask(
+                input_ids,
+                bos_token_id=generation_config.bos_token_id,
+                pad_token_id=generation_config.pad_token_id,
+                max_length=output_ids.shape[1],
+            )
 
         mask = (mask != generation_config.bos_token_id) & (mask != generation_config.pad_token_id)
         output_ids = output_ids[mask].reshape(batch_size, self.decoder.num_codebooks, -1)