Commit a7615f9

refactor(multimodal U-Z): Migrate MM models to merge_by_field_config
Migrate step3_vl, tarsier, terratorch, ultravox, voxtral, and whisper to use merge_by_field_config = True, enabling HF-compatible input shapes. Remove flatten_bn calls and the dead flatten_and_concat function.

Signed-off-by: Ayush Satyam <[email protected]>
1 parent 512b8af commit a7615f9
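
For context, here is a rough sketch of the shape handling that the removed flatten_bn calls performed. This is illustrative only; the helper name and shapes below are made up and are not the repository's code. With merge_by_field_config = True, the multimodal processor is expected to hand the model tensors already merged over the batch and per-prompt item dimensions, so this flattening no longer needs to happen inside each model:

import torch

def flatten_bn_sketch(x: torch.Tensor) -> torch.Tensor:
    # What the removed per-model flattening did: [B, N, ...] -> [B * N, ...]
    return x.flatten(0, 1)

# Hypothetical audio-feature batch: 2 prompts with 3 audio clips each
legacy = torch.randn(2, 3, 80, 100)   # [B, N, 80, M]
merged = flatten_bn_sketch(legacy)    # [6, 80, 100], the HF-compatible shape
assert merged.shape == (6, 80, 100)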

File tree

vllm/model_executor/models/ultravox.py
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/whisper.py

3 files changed: +11, -7 lines

vllm/model_executor/models/ultravox.py

Lines changed: 7 additions & 5 deletions

@@ -48,7 +48,6 @@
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
-    flatten_bn,
     init_vllm_registered_model,
     maybe_prefix,
 )
@@ -421,6 +420,8 @@ def forward(
     dummy_inputs=UltravoxDummyInputsBuilder,
 )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -547,9 +548,9 @@ def _process_audio_input(
         # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
         audio_features = pad_and_concat_to_dim3(audio_input["data"])

-        # [B1, B2] -> [B1+B2]
-        audio_lens = flatten_bn(audio_input["lens"], concat=True)
-        audio_token_len = flatten_bn(audio_input["token_len"], concat=True)
+        # Audio lens and token_len are already in the correct shape
+        audio_lens = audio_input["lens"]
+        audio_token_len = audio_input["token_len"]

         embeddings = self._audio_features_to_embeddings(audio_features, audio_lens)

@@ -662,7 +663,8 @@ def pad_and_concat_to_dim3(
     if isinstance(features, torch.Tensor):
         if features.ndim > 3:
             # Flatten [B, N, 80, M] -> [B * N, 80, M]
-            features = flatten_bn(features)
+            batch_size = features.shape[0] * features.shape[1]
+            features = features.view(batch_size, *features.shape[2:])
         return features

     features = [pad_and_concat_to_dim3(f) for f in features]
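
The last hunk swaps the helper for an explicit reshape inside pad_and_concat_to_dim3. A minimal standalone check of that equivalence, assuming a contiguous 4-D tensor (not code from the repository):

import torch

features = torch.randn(2, 3, 80, 100)                        # [B, N, 80, M]
batch_size = features.shape[0] * features.shape[1]
flattened = features.view(batch_size, *features.shape[2:])   # [B * N, 80, M]
assert flattened.shape == (6, 80, 100)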

vllm/model_executor/models/voxtral.py

Lines changed: 3 additions & 2 deletions

@@ -61,7 +61,7 @@
 )

 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
-from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix
+from .utils import init_vllm_registered_model, maybe_prefix

 logger = init_logger(__name__)

@@ -337,6 +337,8 @@ def _get_data_parser(self) -> MultiModalDataParser:
 class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
+    merge_by_field_config = True
+
     supported_languages = ISO639_1_SUPPORTED_LANGS

     packed_modules_mapping = {
@@ -445,7 +447,6 @@ def _parse_and_validate_audio_arrays(
                 f"Incorrect type of audio_arrays. Got type: {type(audio_arrays)}"
             )

-        audio_arrays = flatten_bn(audio_arrays)
         if isinstance(audio_arrays, torch.Tensor):
             audio_arrays = list(audio_arrays.unbind(0))
         return audio_arrays
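
With the flatten_bn call gone, voxtral assumes audio_arrays already arrives merged; the remaining unbind(0) just splits a batched tensor into a list of per-audio tensors. A small illustration with hypothetical shapes (not repository code):

import torch

audio_arrays = torch.randn(4, 16000)     # already [num_audios, num_samples]
as_list = list(audio_arrays.unbind(0))   # four 1-D tensors of 16000 samples each
assert len(as_list) == 4 and as_list[0].shape == (16000,)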

vllm/model_executor/models/whisper.py

Lines changed: 1 addition & 0 deletions

@@ -781,6 +781,7 @@ def _get_prompt_updates(
 class WhisperForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal
 ):
+    merge_by_field_config = True
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
