[Bugfix] should use stack instead of concat (#22972)

947132885 · Isotr0py · web-flow · commit fe0411fc6fa3 · 2025-08-17T08:46:36.000Z
Signed-off-by: 947132885 <947132885@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
@@ -694,6 +694,17 @@ def compute_logits(
         return logits
 
 
+def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor:
+    """Concat along dim 0, applying flatten_bn until trailing dims match."""
+    # True when every tensor in the list has the same shape[1:].
+    def _can_concat(x: list[torch.Tensor]):
+        return len(set(map(lambda _x: _x.shape[1:], x))) == 1
+    # NOTE(review): an empty list never passes this check; confirm callers.
+    if _can_concat(x):
+        return torch.concat(x)
+    return flatten_and_concat(flatten_bn(x))
+
+
 @MULTIMODAL_REGISTRY.register_processor(
     MultiModalProcessor,
     info=MultiModalProcessingInfo,
@@ -766,8 +777,7 @@ def get_multimodal_embeddings(self, **kwargs):
             if isinstance(pixel_values, torch.Tensor):
                 pixel_values = flatten_bn(pixel_values).to(self.dtype)
             elif is_list_of(pixel_values, torch.Tensor):
-                pixel_values = flatten_bn(flatten_bn(pixel_values),
-                                          concat=True).to(self.dtype)
+                pixel_values = flatten_and_concat(pixel_values).to(self.dtype)
             else:
                 raise ValueError(
                     f"Unsupported pixel_values type {type(pixel_values)}. "