[Bugfix][V1] Only get input embeddings w/ multi-modal models if first PP (#17916)

jinhuang12 · Jin Huang · web-flow · commit 8dd0671baca6 · 2025-05-13T15:10:07.000+08:00
Signed-off-by: Jin Huang <jinhun@amazon.com>
Co-authored-by: Jin Huang <jinhun@amazon.com>
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -1107,7 +1107,7 @@ def execute_model(
         else:
             mm_embeds = []
 
-        if self.is_multimodal_model:
+        if self.is_multimodal_model and get_pp_group().is_first_rank:
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.