Commit 25def1f

Override get_input_embeddings in Eagle3 to process text-only inputs
Implement a custom get_input_embeddings() in Eagle3LlamaForCausalLM that accepts multimodal parameters but processes only text embeddings. This lets the Llama3-based Eagle3 drafter handle text inputs correctly while remaining compatible with multimodal verifier interfaces: the drafter receives multimodal context through auxiliary hidden states from the verifier rather than processing multimodal inputs directly.

Signed-off-by: rahul-tuli <[email protected]>
1 parent fcaf21e commit 25def1f

File tree

1 file changed: +9 −2 lines


vllm/model_executor/models/llama_eagle3.py

Lines changed: 9 additions & 2 deletions
@@ -20,6 +20,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import (LlamaDecoderLayer,
                                               LlamaForCausalLM)
+from vllm.multimodal.inputs import NestedTensors

 from .utils import AutoWeightsLoader, maybe_prefix


@@ -242,8 +243,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             requires_grad=False,
         )

-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+        is_multimodal: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # The llama3 drafter only processes text embeddings
+        return self.model.embed_tokens(input_ids)

     def forward(
         self,
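The pattern in this commit can be sketched in isolation: the drafter's get_input_embeddings() accepts the multimodal keyword arguments so its signature matches the verifier-facing interface, but it looks up only the text token embeddings and ignores the rest. The class and embedding table below are hypothetical stand-ins for illustration, not the real vLLM classes.

```python
from typing import Any, Optional


class TextOnlyDrafter:
    """Hypothetical stand-in for an Eagle3-style text-only drafter."""

    def __init__(self, embed_tokens: dict):
        # embed_tokens maps token id -> embedding vector
        # (a plain-dict stand-in for nn.Embedding).
        self.embed_tokens = embed_tokens

    def get_input_embeddings(
        self,
        input_ids: list,
        multimodal_embeddings: Optional[Any] = None,  # accepted, unused
        is_multimodal: Optional[Any] = None,          # accepted, unused
    ) -> list:
        # Text-only lookup: multimodal context reaches the drafter via the
        # verifier's auxiliary hidden states, not through these arguments.
        return [self.embed_tokens[i] for i in input_ids]


drafter = TextOnlyDrafter({0: [0.0, 0.0], 1: [1.0, 1.0]})
# Multimodal arguments are accepted for interface compatibility but ignored.
embs = drafter.get_input_embeddings([1, 0], multimodal_embeddings=["ignored"])
```

Keeping the extra parameters in the signature (rather than dropping them) is what allows a multimodal verifier to call the drafter uniformly without special-casing text-only models.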

0 commit comments