[Bugfix] Fix RuntimeError: Index put requires the source and destination dtypes match (#22065)

chaunceyjiang · web-flow · commit 17eaaef59504 · 2025-08-07T19:20:21.000-07:00
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import io
+import json
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+import torch
+from transformers import AutoConfig
+
+from tests.conftest import ImageTestAssets
+from tests.utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+CONFIG = AutoConfig.from_pretrained(MODEL_NAME)  # not referenced below
+MAXIMUM_IMAGES = 2  # serialized into the --limit-mm-per-prompt value
+
+
+@pytest.fixture(scope="module")  # evaluated once per test module
+def default_image_embeds_server_args() -> list[str]:
+    return [  # CLI flags handed to RemoteOpenAIServer by the server fixture
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "4",
+        "--enforce-eager",
+        "--limit-mm-per-prompt",
+        json.dumps({"image": MAXIMUM_IMAGES}),  # JSON text: {"image": 2}
+    ]
+
+
+@pytest.fixture(scope="module")  # one server instance per test module
+def server_with_image_embeds(default_image_embeds_server_args):
+    with RemoteOpenAIServer(MODEL_NAME,
+                            default_image_embeds_server_args) as remote_server:
+        yield remote_server  # the with-block exits at fixture teardown
+
+
+@pytest_asyncio.fixture  # no scope given, so pytest's default scope applies
+async def client_with_image_embeds(server_with_image_embeds):
+    async with server_with_image_embeds.get_async_client() as async_client:
+        yield async_client  # async context exits at fixture teardown
+
+
+def encode_image_embedding_to_base64(image_embedding) -> str:
+    """
+    Serialize ``image_embedding`` with ``torch.save`` and return the bytes
+    as a base64-encoded UTF-8 string.
+    """
+    with io.BytesIO() as stream:
+        torch.save(image_embedding, stream)
+        payload = stream.getvalue()
+    encoded = base64.b64encode(payload)
+    return encoded.decode('utf-8')
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+# torch.half is an alias of torch.float16, so list each distinct dtype once;
+# bfloat16 matches the server's --dtype, the other two differ from it.
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16, torch.float32])
+async def test_completions_with_image_embeds(
+    client_with_image_embeds: openai.AsyncOpenAI,
+    model_name: str,
+    image_assets: ImageTestAssets,
+    dtype: torch.dtype,
+):
+    """Chat completion with one base64 image embedding returns text."""
+    # Cast the embedding so its dtype may differ from the server's dtype.
+    image_embeds = image_assets[0].image_embeds.to(dtype=dtype)
+    base64_image_embedding = encode_image_embedding_to_base64(image_embeds)
+    chat_completion = await client_with_image_embeds.chat.completions.create(
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful assistant."
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text":
+                        "Describe these images separately. For each image, "
+                        "reply with a short sentence (no more than 10 words).",
+                    },
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding,
+                    },
+                ],
+            },
+        ],
+        model=model_name,
+    )
+    content = chat_completion.choices[0].message.content
+    assert isinstance(content, str) and len(content) > 0
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
@@ -401,7 +401,7 @@ def merge_multimodal_embeddings_from_map(
     """
     flattened_embeddings = _flatten_embeddings(multimodal_embeddings)
     inputs_embeds[placeholder_map.dest] = flattened_embeddings[
-        placeholder_map.src]
+        placeholder_map.src].to(dtype=inputs_embeds.dtype)
     return inputs_embeds
 
 
@@ -421,7 +421,8 @@ def _merge_multimodal_embeddings(
     flattened = _flatten_embeddings(multimodal_embeddings)
     try:
         # This is equivalent to: inputs_embeds[is_multimodal] = flattened.
-        inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), flattened)
+        inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1),
+                                      flattened.to(dtype=inputs_embeds.dtype))
     except RuntimeError as e:
         num_expected_tokens = is_multimodal.sum().item()
         assert isinstance(num_expected_tokens, int)