4 changes: 2 additions & 2 deletions optimum/executorch/modeling.py
@@ -1346,8 +1346,8 @@ def text_generation(

# Sanity check
if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.bos_token_id:
raise ValueError(
f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} must be the same as the model's bos_token_id={self.bos_token_id}."
logging.warning(
f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} is not the same as the model's bos_token_id={self.bos_token_id}."
)
if isinstance(self.tokenizer, PreTrainedTokenizer) and not verify_eos_tokens_in_pretrained_tokenizer(
self.eos_token_id, self.tokenizer
60 changes: 53 additions & 7 deletions optimum/exporters/executorch/integrations.py
@@ -22,6 +22,7 @@
from transformers import (
AutoConfig,
AutoProcessor,
AutoTokenizer,
PreTrainedModel,
StaticCache,
T5ForConditionalGeneration,
@@ -34,18 +35,63 @@

from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache

from .utils import apply_chat_template_with_fallback, save_config_to_constant_methods
from .utils import apply_chat_template_with_fallback, process_conversation_inputs, save_config_to_constant_methods

def _patch_idefics3_vision_embeddings_for_export(vision_model):
"""
Patch Idefics3VisionEmbeddings to make it export-friendly by removing data-dependent operations.
This assumes batch_size=1 and a full attention mask (all 1s).
"""
import types

def export_friendly_forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:

jackzhxng (Collaborator, Author) commented on Oct 10, 2025:
@zucchini-nlp This part could not export because of the data-dependent loop, so I unroll it here to just handle one image. Here's the original code:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/modeling_idefics3.py#L149

Member:
Hmm, I thought it was fixed in huggingface/transformers#39614

jackzhxng (Collaborator, Author) commented on Oct 13, 2025:
Ah, that one just exports the vision encoder; this one exports the get_image_features function, which calls the vision encoder. I'm thinking that some code in this function might be confusing to export cc @tugsbayasgalan

Member:
Ahh, I see; it is probably the part where the images are unpadded, and that code is value-dependent.

For my understanding, can we export the vision and the LM parts separately but not the merging logic? Most get_image_features implementations might not be 100% exportable for VLMs, so keeping it outside of export could work better in the long term.

jackzhxng (Collaborator, Author):
The reason we export get_image_features is that if we only exported the vision encoder, we would need to write the rest of the merging logic in C++, which is difficult to do and harder to scale. I wonder if it's possible to upstream an exportable version of get_image_features? If not, I'm happy to just monkey-patch like this.

Member:
"I wonder if it's possible to upstream an exportable version of get_image_features"

This would be the best solution, and if you want to submit a PR it'll be very welcome. I see that we're passing pixel_attention_mask=None in all cases in this PR, but the vision backbone is still not exportable?


batch_size, _, max_im_h, max_im_w = pixel_values.shape

patch_embeds = self.patch_embedding(pixel_values)
embeddings = patch_embeds.flatten(2).transpose(1, 2)

nb_patches_h = max_im_h // self.patch_size
nb_patches_w = max_im_w // self.patch_size
N = self.num_patches_per_side

# For export, we assume full attention mask and compute position IDs statically.
# This avoids the data-dependent loop over batch dimension.
h_indices = torch.arange(nb_patches_h, device=pixel_values.device, dtype=torch.long)
w_indices = torch.arange(nb_patches_w, device=pixel_values.device, dtype=torch.long)

jackzhxng (Collaborator, Author) commented:
This is the second change: we don't have a kernel for aten.bucketize, so I compute it manually.

# This replaces bucketize(x, boundaries=[1/N, 2/N, ...], right=True) ≈ floor(x * N), which
# we don't have a kernel for at the moment.
bucket_coords_h = (h_indices * N) // nb_patches_h
bucket_coords_w = (w_indices * N) // nb_patches_w

bucket_coords_h = torch.clamp(bucket_coords_h, max=N - 1)
bucket_coords_w = torch.clamp(bucket_coords_w, max=N - 1)

pos_ids = (bucket_coords_h[:, None] * N + bucket_coords_w[None, :]).reshape(-1)
position_ids = pos_ids.unsqueeze(0).expand(batch_size, -1)
embeddings = embeddings + self.position_embedding(position_ids)
return embeddings

# Patch the forward method.
vision_model.embeddings.forward = types.MethodType(export_friendly_forward, vision_model.embeddings)
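
As a quick aside, a standalone sanity check (not part of the PR; the sizes are illustrative and chosen so the fractional coordinates are exactly representable) that the integer arithmetic above reproduces the torch.bucketize call from the original Idefics3VisionEmbeddings.forward:

import torch

N = 32           # num_patches_per_side
nb_patches = 16  # patches along one image dimension

# Original formulation: bucketize fractional coordinates against boundaries [1/N, 2/N, ..., (N-1)/N].
boundaries = torch.arange(1 / N, 1.0, 1 / N)
fractional_coords = torch.arange(0, 1 - 1e-6, 1 / nb_patches)
reference = torch.bucketize(fractional_coords, boundaries, right=True)

# Export-friendly formulation used by the patch: floor(i * N / nb_patches), clamped to N - 1.
indices = torch.arange(nb_patches, dtype=torch.long)
replacement = torch.clamp((indices * N) // nb_patches, max=N - 1)

assert torch.equal(reference, replacement)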


class VisionExportableModule(torch.nn.Module):
def __init__(self, model: torch.nn.Module):
super().__init__()
self.model = model

# Patch Idefics3 vision embeddings if needed
if hasattr(model, 'model') and hasattr(model.model, 'vision_model'):
model_type = getattr(model.config, 'model_type', '')
if 'idefics3' in model_type.lower():
_patch_idefics3_vision_embeddings_for_export(model.model.vision_model)

def prepare_export_inputs(self):
# 1. Get export inputs
model_id = self.model.config.name_or_path
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
sample_conversation_with_image = [
{
"role": "user",
@@ -54,12 +100,10 @@ def prepare_export_inputs(self):
],
},
]
processed_inputs = processor.apply_chat_template(
processed_inputs = process_conversation_inputs(
processor,
tokenizer,
sample_conversation_with_image,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
if "pixel_values" not in processed_inputs:
raise ValueError(
@@ -76,7 +120,9 @@ def forward(
self,
input_features: torch.FloatTensor,
):
image_embeds = self.model.get_image_features(input_features)
# Pass pixel_attention_mask=None to avoid data-dependent operations during export.
# The model will create a mask full of 1s internally if None is passed.
image_embeds = self.model.get_image_features(input_features, pixel_attention_mask=None)
if isinstance(image_embeds, list):
image_embeds = torch.stack(image_embeds)
return image_embeds
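
Following up on the review discussion above about exporting get_image_features end to end, a minimal sketch of how a wrapper like this can be handed to torch.export (not part of the PR; the checkpoint name, auto class, and input shape are placeholder assumptions, and the snippet shows the flow rather than a verified run):

import torch
from transformers import AutoModelForImageTextToText

# Placeholder checkpoint; any SmolVLM/Idefics3-style model exposing get_image_features would do.
model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
wrapper = VisionExportableModule(model)

# Assumed (batch, num_images, channels, height, width) layout for the pixel values;
# in the real flow these come from prepare_export_inputs() via the processor.
example_pixel_values = torch.randn(1, 1, 3, 512, 512)

# The exported graph captures the vision encoder plus the merging/connector logic,
# so none of it has to be re-implemented in C++ on the runtime side.
exported_program = torch.export.export(wrapper, (example_pixel_values,))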
15 changes: 13 additions & 2 deletions optimum/exporters/executorch/tasks/multimodal_text_to_text.py
@@ -180,8 +180,19 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
"device": device,
},
)
decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name

# Most <Model>ForConditionalGeneration classes have the text_model and encoder models as attributes; however,
# some have `self.model = <Model>` (the base version, not for conditional generation), and this `self.model`
# contains the text_model and encoder model attributes.
if hasattr(eager_model, "model"):
decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model.model)
# Set these as top level attributes.
setattr(eager_model, decoder_name, getattr(eager_model.model, decoder_name))
encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
setattr(eager_model, encoder_name, getattr(eager_model.model, encoder_name))
else:
decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
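
For illustration only (hypothetical module names, not from the PR), the two layouts this branch handles and what the attribute promotion buys us:

import torch.nn as nn

class BaseMultimodal(nn.Module):            # analogous to <Model>, the base model
    def __init__(self):
        super().__init__()
        self.text_model = nn.Identity()     # stand-in for the decoder
        self.vision_model = nn.Identity()   # stand-in for the vision encoder

class ForConditionalGeneration(nn.Module):  # analogous to <Model>ForConditionalGeneration
    def __init__(self):
        super().__init__()
        self.model = BaseMultimodal()       # submodules are nested one level down

m = ForConditionalGeneration()
# Promote the nested submodules to top-level attributes, as the branch above does,
# so downstream export code can always rely on getattr(model, decoder_name) etc.
setattr(m, "text_model", m.model.text_model)
setattr(m, "vision_model", m.model.vision_model)
assert m.text_model is m.model.text_model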

# Need to do this since apparently when nested modules (e.g. model.language_model) access the config
# property, it always comes from the generation_config.json file, not the `generation_config` override
36 changes: 30 additions & 6 deletions optimum/exporters/executorch/utils.py
@@ -139,16 +139,12 @@ def process_conversation_inputs(
input_conversation: List[Dict[str, Any]],
):
"""
Process input conversation for multimodal models.

This function handles the preprocessing of conversation inputs, with special handling for
GraniteSpeechProcessor which requires extracting and processing audio content from conversations
prior to feeding into the processor.
Process an input conversation into tensor inputs for multimodal models.

Args:
processor: The processor to use for input processing
tokenizer: The tokenizer to use for text processing
input_conversation: List of conversation messages, may contain audio content
input_conversation: List of conversation messages

Returns:
Processed inputs ready for model consumption
@@ -190,6 +186,34 @@
# Generate text prompt and process with audio
prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
inputs = processor(prompt, wav, return_tensors="pt")
elif isinstance(processor, transformers.SmolVLMProcessor):
from transformers.image_utils import load_image

conversation = copy.deepcopy(input_conversation)
images = []

# Extract image URLs from conversation
for message in conversation:
if isinstance(message.get("content"), list):
# Filter out image entries and collect URLs
image_urls = [item["url"] for item in message["content"] if item.get("type") == "image"]
images.extend([load_image(url) for url in image_urls])

# Remove image entries from content
message["content"] = [item for item in message["content"] if item.get("type") != "image"]

# Apply chat template to get the text prompt (tokenize=False so the processor below
# receives a plain string it can pair with the loaded images)
prompt = apply_chat_template_with_fallback(
processor,
conversation,
add_generation_prompt=True,
tokenize=False,
)

# Process with text and images
inputs = processor(text=prompt, images=images, return_tensors="pt")
else:
# Standard processing for other processors
inputs = apply_chat_template_with_fallback(
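
To round out the utils.py change, a minimal usage sketch of the new SmolVLM branch (placeholder checkpoint and image URL; the conversation shape mirrors the one the branch expects, with image entries carrying a "url" field):

from transformers import AutoProcessor, AutoTokenizer

model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"  # placeholder checkpoint
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/cat.png"},  # placeholder URL
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# Image entries are loaded and stripped from the conversation, the chat template produces
# the text prompt, and the processor pairs that prompt with the loaded images.
inputs = process_conversation_inputs(processor, tokenizer, conversation)
print(sorted(inputs.keys()))  # expect input_ids, attention_mask, pixel_values, ...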