huggingface · jackzhxng · Oct 8, 2025 · Oct 10, 2025 · Oct 15, 2025 · Oct 15, 2025
diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py
@@ -1346,8 +1346,8 @@ def text_generation(
 
         # Sanity check
         if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.bos_token_id:
-            raise ValueError(
-                f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} must be the same as the model's bos_token_id={self.bos_token_id}."
+            logging.warning(
+                f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} is not the same as the model's bos_token_id={self.bos_token_id}."
             )
         if isinstance(self.tokenizer, PreTrainedTokenizer) and not verify_eos_tokens_in_pretrained_tokenizer(
             self.eos_token_id, self.tokenizer

diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py
@@ -22,6 +22,7 @@
 from transformers import (
     AutoConfig,
     AutoProcessor,
+    AutoTokenizer,
     PreTrainedModel,
     StaticCache,
     T5ForConditionalGeneration,
@@ -34,18 +35,63 @@
 
 from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache
 
-from .utils import apply_chat_template_with_fallback, save_config_to_constant_methods
+from .utils import apply_chat_template_with_fallback, process_conversation_inputs, save_config_to_constant_methods
+
+def _patch_idefics3_vision_embeddings_for_export(vision_model):
+    """
+    Patch Idefics3VisionEmbeddings to make it export-friendly by removing data-dependent operations.
+    This assumes batch_size=1 and a full attention mask (all 1s).
+    """
+    import types
+
+    def export_friendly_forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+        nb_patches_h = max_im_h // self.patch_size
+        nb_patches_w = max_im_w // self.patch_size
+        N = self.num_patches_per_side
+
+        # For export, we assume full attention mask and compute position IDs statically.
+        # This avoids the data-dependent loop over batch dimension.
+        h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=torch.long)
+        w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=torch.long)
+
+        # Integer arithmetic stands in for bucketize(x, boundaries=[1/N, 2/N, ...], right=True) ≈ floor(x * N),
+        # because we don't have a kernel for bucketize at the moment.
+        bucket_coords_h = (h_indices * N) // nb_patches_h
+        bucket_coords_w = (w_indices * N) // nb_patches_w
+
+        bucket_coords_h = torch.clamp(bucket_coords_h, max=N - 1)
+        bucket_coords_w = torch.clamp(bucket_coords_w, max=N - 1)
+
+        pos_ids = (bucket_coords_h[:, None] * N + bucket_coords_w[None, :]).reshape(-1)
+        position_ids = pos_ids.unsqueeze(0).expand(batch_size, -1)
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+    # Patch the forward method.
+    vision_model.embeddings.forward = types.MethodType(export_friendly_forward, vision_model.embeddings)
 
 
 class VisionExportableModule(torch.nn.Module):
     def __init__(self, model: torch.nn.Module):
         super().__init__()
         self.model = model
 
+        # Patch Idefics3 vision embeddings if needed
+        if hasattr(model, 'model') and hasattr(model.model, 'vision_model'):
+            model_type = getattr(model.config, 'model_type', '')
+            if 'idefics3' in model_type.lower():
+                _patch_idefics3_vision_embeddings_for_export(model.model.vision_model)
+
     def prepare_export_inputs(self):
         # 1. Get export inputs
         model_id = self.model.config.name_or_path
         processor = AutoProcessor.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
         sample_conversation_with_image = [
             {
                 "role": "user",
@@ -54,12 +100,10 @@ def prepare_export_inputs(self):
                 ],
             },
         ]
-        processed_inputs = processor.apply_chat_template(
+        processed_inputs = process_conversation_inputs(
+            processor,
+            tokenizer,
             sample_conversation_with_image,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
         )
         if "pixel_values" not in processed_inputs:
             raise ValueError(
@@ -76,7 +120,9 @@ def forward(
         self,
         input_features: torch.FloatTensor,
     ):
-        image_embeds = self.model.get_image_features(input_features)
+        # Pass pixel_attention_mask=None to avoid data-dependent operations during export.
+        # The model will create a mask full of 1s internally if None is passed.
+        image_embeds = self.model.get_image_features(input_features, pixel_attention_mask=None)
         if isinstance(image_embeds, list):
             image_embeds = torch.stack(image_embeds)
         return image_embeds

diff --git a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py
@@ -180,8 +180,19 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
             "device": device,
         },
     )
-    decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
-    encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
+
+    # Most `<Model>ForConditionalGeneration` classes have the text_model and encoder models as attributes; however,
+    # some have `self.model = <Model>` (the base version not for conditional generation), and this `self.model`
+    # contains the text_model and encoder model attributes.
+    if hasattr(eager_model, "model"):
+        decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model.model)
+        # Set these as top level attributes.
+        setattr(eager_model, decoder_name, getattr(eager_model.model, decoder_name))
+        encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
+        setattr(eager_model, encoder_name, getattr(eager_model.model, encoder_name))
+    else:
+        decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
+        encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
 
     # Need to do this since apparently when nested modules (e.g. model.language_model) access the .property
     # config, it always comes from the generation_config.json file, not the `generation_config` override

diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py
@@ -139,16 +139,12 @@ def process_conversation_inputs(
     input_conversation: List[Dict[str, Any]],
 ):
     """
-    Process input conversation for multimodal models.
-
-    This function handles the preprocessing of conversation inputs, with special handling for
-    GraniteSpeechProcessor which requires extracting and processing audio content from conversations
-    prior to feeding into the processor.
+    Process an input conversation into tensor inputs for multimodal models.
 
     Args:
         processor: The processor to use for input processing
         tokenizer: The tokenizer to use for text processing
-        input_conversation: List of conversation messages, may contain audio content
+        input_conversation: List of conversation messages
 
     Returns:
         Processed inputs ready for model consumption
@@ -190,6 +186,34 @@ def process_conversation_inputs(
         # Generate text prompt and process with audio
         prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
         inputs = processor(prompt, wav, return_tensors="pt")
+    elif isinstance(processor, transformers.SmolVLMProcessor):
+        from transformers.image_utils import load_image
+
+        conversation = copy.deepcopy(input_conversation)
+        images = []
+
+        # Extract image URLs from conversation
+        for message in conversation:
+            if isinstance(message.get("content"), list):
+                # Filter out image entries and collect URLs
+                image_urls = [item["url"] for item in message["content"] if item.get("type") == "image"]
+                images.extend([load_image(url) for url in image_urls])
+
+                # Remove image entries from content
+                message["content"] = [item for item in message["content"] if item.get("type") != "image"]
+
+        # Apply chat template (NOTE(review): tokenize=True/return_dict=True suggests tensors, not text — confirm)
+        prompt = apply_chat_template_with_fallback(
+            processor,
+            conversation,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+
+        # Process with text and images
+        inputs = processor(text=prompt, images=images, return_tensors="pt")
     else:
         # Standard processing for other processors
         inputs = apply_chat_template_with_fallback(