diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 8f071eac2201..2573bb0441fd 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -421,6 +421,8 @@ def forward(
     dummy_inputs=UltravoxDummyInputsBuilder,
 )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -663,6 +665,7 @@ def pad_and_concat_to_dim3(
         if features.ndim > 3:
             # Flatten [B, N, 80, M] -> [B * N, 80, M]
             features = flatten_bn(features)
+        return features

     features = [pad_and_concat_to_dim3(f) for f in features]
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 0d77b72675e2..f929ba9913ec 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -61,7 +61,7 @@
 )

 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
-from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix
+from .utils import init_vllm_registered_model, maybe_prefix

 logger = init_logger(__name__)

@@ -337,6 +337,8 @@ def _get_data_parser(self) -> MultiModalDataParser:
 class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
+    merge_by_field_config = True
+
     supported_languages = ISO639_1_SUPPORTED_LANGS

     packed_modules_mapping = {
@@ -445,7 +447,6 @@ def _parse_and_validate_audio_arrays(
                 f"Incorrect type of audio_arrays. Got type: {type(audio_arrays)}"
             )

-        audio_arrays = flatten_bn(audio_arrays)
         if isinstance(audio_arrays, torch.Tensor):
             audio_arrays = list(audio_arrays.unbind(0))
         return audio_arrays
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index ce9634935d24..397556cbbcc4 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -36,7 +36,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
@@ -51,6 +51,7 @@
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.transformers_utils.processor import cached_get_processor
+from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
@@ -135,7 +136,10 @@ class WhisperAudioInputs(TensorSchema):
     - t: Time frames (M)
     """

-    input_features: Annotated[Optional[NestedTensors], TensorShape("b", "nmb", "t")]
+    input_features: Annotated[
+        Optional[list[torch.Tensor]],
+        TensorShape("b", "nmb", "t"),
+    ]


 class WhisperEncoderAttention(MultiHeadAttention):
@@ -781,6 +785,7 @@ def _get_prompt_updates(
 class WhisperForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal
 ):
+    merge_by_field_config = True
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
@@ -936,12 +941,7 @@ def _parse_and_validate_audio_input(self, **kwargs: object) -> WhisperAudioInput
         input_features = kwargs.pop("input_features", None)

         if input_features is not None:
-            if not isinstance(input_features, (torch.Tensor, list)):
-                raise ValueError(
-                    "Incorrect type of audio features. "
-                    f"Got type: {type(input_features)}"
-                )
-            input_features = torch.cat([feat.to(self.dtype) for feat in input_features])
+            input_features = json_map_leaves(lambda x: x.to(self.dtype), input_features)

         return WhisperAudioInputs(input_features=input_features)
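Note on the whisper.py change: with `merge_by_field_config = True`, batched multimodal fields arrive already merged per item (e.g. as a list of per-item tensors), so the manual type check and `torch.cat` in `_parse_and_validate_audio_input` drop out and the only remaining work is the dtype cast, which `json_map_leaves` applies to every tensor leaf of the (possibly nested) input. The sketch below mimics that leaf-mapping pattern with a hypothetical `map_leaves` helper and dummy shapes; it is an illustration only, not vLLM's `json_map_leaves` implementation.

```python
# Hypothetical stand-in for vllm.utils.jsontree.json_map_leaves, shown only to
# illustrate the leaf-mapping pattern used in _parse_and_validate_audio_input.
from typing import Any, Callable

import torch


def map_leaves(fn: Callable[[torch.Tensor], torch.Tensor], value: Any) -> Any:
    """Apply `fn` to every tensor leaf of a nested list/dict, keeping structure."""
    if isinstance(value, torch.Tensor):
        return fn(value)
    if isinstance(value, list):
        return [map_leaves(fn, v) for v in value]
    if isinstance(value, dict):
        return {k: map_leaves(fn, v) for k, v in value.items()}
    return value


# The cast must work whether the field arrives as one stacked tensor or as a
# list of per-item feature tensors (dummy shapes below).
stacked = torch.randn(2, 128, 3000)
per_item = [torch.randn(128, 3000), torch.randn(128, 1500)]

assert map_leaves(lambda x: x.to(torch.float16), stacked).dtype == torch.float16
assert all(t.dtype == torch.float16
           for t in map_leaves(lambda x: x.to(torch.float16), per_item))
```

The Ultravox and Voxtral hunks follow the same reasoning: once the field config delivers per-item tensors directly, the explicit `flatten_bn` calls appear to be redundant and are removed.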