Commit 94bc6ba

rahul-tuli and claude committed
feat: Add multimodal input support to Eagle3 Llama4
Enables multimodal input processing for Eagle3 speculative decoding with Llama4 models, supporting vision and other modalities.

Key changes:

- Updated get_input_embeddings to support multimodal embeddings
- Added merge_multimodal_embeddings integration
- Proper handling of the image_token_index configuration
- Maintains compatibility with existing text-only workflows

Co-Authored-By: Claude <[email protected]>
Signed-off-by: Rahul Tuli <[email protected]>
1 parent c1f9a25 commit 94bc6ba
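
For context on the helper this commit wires in, here is a minimal sketch of what merge_multimodal_embeddings does conceptually. It is a simplified stand-in under assumed flat (num_tokens, hidden_size) shapes, not vLLM's actual implementation (which also handles nested and batched embedding structures); the _sketch suffix marks it as hypothetical.

import torch

# Simplified, hypothetical sketch (not vLLM's implementation): scatter the
# precomputed multimodal feature rows into the text embedding sequence at
# the positions occupied by the image placeholder token.
def merge_multimodal_embeddings_sketch(
    input_ids: torch.Tensor,              # (num_tokens,) token ids
    inputs_embeds: torch.Tensor,          # (num_tokens, hidden_size)
    multimodal_embeddings: torch.Tensor,  # (num_placeholders, hidden_size)
    image_token_index: int,               # e.g. config.image_token_index
) -> torch.Tensor:
    mask = input_ids == image_token_index
    merged = inputs_embeds.clone()
    # One feature row lands on each placeholder position.
    merged[mask] = multimodal_embeddings.to(merged.dtype)
    return merged

This is also why the diff below reads the placeholder id with getattr(self.config, "image_token_index", None): a text-only configuration may not define it, so the lookup cannot assume it exists.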

File tree

1 file changed: +11 -6 lines changed

vllm/model_executor/models/llama4_eagle3.py

Lines changed: 11 additions & 6 deletions
@@ -143,6 +143,7 @@ def forward(
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
         inputs_embeds: Optional[torch.Tensor] = None,
+        multimodal_embeddings: Optional[NestedTensors] = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Forward pass for Eagle3 draft generation.
@@ -152,6 +153,7 @@ def forward(
             positions: Position indices for rotary embeddings
             hidden_states: Auxiliary hidden states from target model
             inputs_embeds: Pre-computed input embeddings (optional)
+            multimodal_embeddings: Multimodal embeddings (optional)

         Returns:
             Tuple of (hidden_states, hidden_states) following vLLM convention
@@ -160,6 +162,15 @@
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings(input_ids)

+        # Apply multimodal embeddings if provided
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                getattr(self.config, "image_token_index", None),
+            )
+
         # Eagle3 pattern: auxiliary hidden states have same dimension as embeddings
         # This assertion ensures compatibility for the single decoder layer
         assert hidden_states.shape[-1] == inputs_embeds.shape[-1], (
@@ -376,12 +387,6 @@ def forward(
         Returns:
             Tuple of (hidden_states, hidden_states) for vLLM compatibility
         """
-        if inputs_embeds is not None:
-            raise NotImplementedError(
-                f"{type(self).__name__} does not support multimodal inputs yet. "
-                "Multimodal support for Eagle3 is planned for future releases."
-            )
-
         return self.model(input_ids, positions, hidden_states, inputs_embeds)

     def compute_logits(
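
To illustrate the updated calling convention, a hypothetical call-site wrapper follows; model, vision_embeds, run_draft_step, and everything apart from the parameter list shown in the diff are assumptions for illustration, not code from this commit.

from typing import Optional

import torch

# Hypothetical wrapper (not from this commit) around the inner Eagle3 Llama4
# model's forward as changed above. `model` stands in for that module; all
# other names are illustrative.
def run_draft_step(
    model,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    aux_hidden_states: torch.Tensor,      # auxiliary states from the target model
    vision_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    # multimodal_embeddings is the new optional keyword; text-only callers
    # simply omit it and the pre-existing behavior is unchanged.
    hidden, _ = model(
        input_ids,
        positions,
        aux_hidden_states,
        inputs_embeds=None,
        multimodal_embeddings=vision_embeds,
    )
    return hidden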
