From 3a960bb88d2ba76205a27ba975019f1256ae48f0 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Thu, 31 Jul 2025 17:46:41 -0700
Subject: [PATCH 01/31] Bump transformers to 4.54.1

---
 install_dev.py                                | 2 +-
 .../executorch/attentions/custom_kv_cache.py  | 67 +++++++++----------
 optimum/exporters/executorch/integrations.py  | 4 +-
 optimum/exporters/executorch/utils.py         | 4 +-
 setup.py                                      | 2 +-
 5 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/install_dev.py b/install_dev.py
index 5cf313ff..14012554 100644
--- a/install_dev.py
+++ b/install_dev.py
@@ -34,7 +34,7 @@ def install_dep_from_source():
             "-m",
             "pip",
             "install",
-            "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers",  # 4.53.1
+            "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers",  # 4.54.1
         ]
     )
     subprocess.check_call(

diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py
index b5416f87..ab7e485e 100644
--- a/optimum/executorch/attentions/custom_kv_cache.py
+++ b/optimum/executorch/attentions/custom_kv_cache.py
@@ -54,12 +54,12 @@ def __init__(
         # Create a list of CustomKVCache instances, one per layer
         self.kv_cache = torch.nn.ModuleList()
-        for _ in range(config.num_hidden_layers):
+        for layer in self.layers:
             layer_cache = CustomKVCache(
-                max_batch_size=self.max_batch_size,
-                max_context_length=self.max_cache_len,
-                n_heads=self.num_key_value_heads,
-                head_dim=self.head_dim,
+                max_batch_size=layer.max_batch_size,
+                max_context_length=layer.max_cache_len,
+                n_heads=layer.num_heads,
+                head_dim=layer.head_dim,
                 dtype=dtype,
             )
             self.kv_cache.append(layer_cache)
@@ -202,32 +202,29 @@ def __init__(
             layer_device_map=layer_device_map,
         )
-        # make sure layer_device_map is none
         assert layer_device_map is None
         assert device is None or device == "cpu", "Device must be None or 'cpu'"
         self.cache_position = None
-        # Create a list of cache instances, one per layer
-        # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers
+        # Create a list of cache instances, one per layer.
+        # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers.
         self.kv_cache = torch.nn.ModuleList()
-        for layer_idx in range(config.num_hidden_layers):
-            # newer version of transfomer has is_sliding defined
-            # for HybridCache
-            if self.is_sliding[layer_idx]:
+        for layer in self.layers:
+            if layer.is_sliding():
                 # This is a sliding window layer
                 layer_cache = CustomRingKVCache(
-                    max_batch_size=self.max_batch_size,
-                    max_context_length=self.sliding_window_len,
-                    n_heads=self.num_key_value_heads,
-                    head_dim=self.head_dim,
+                    max_batch_size=layer.max_batch_size,
+                    max_context_length=layer.max_cache_len,
+                    n_heads=layer.num_heads,
+                    head_dim=layer.head_dim,
                     dtype=dtype,
                 )
             else:
                 layer_cache = CustomKVCache(
-                    max_batch_size=self.max_batch_size,
-                    max_context_length=self.max_cache_len,
-                    n_heads=self.num_key_value_heads,
-                    head_dim=self.head_dim,
+                    max_batch_size=layer.max_batch_size,
+                    max_context_length=layer.max_cache_len,
+                    n_heads=layer.num_heads,
+                    head_dim=layer.head_dim,
                     dtype=dtype,
                 )
             self.kv_cache.append(layer_cache)
@@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
         # For CustomRingKVCache, we need to handle the sequence length differently
         layer_cache = self.kv_cache[layer_idx]
-        if self.is_sliding[layer_idx]:
+        if self.layers[layer_idx].is_sliding():
             # CustomRingKVCache cache_position_manager which
             # maintains cache position for each slot in the kv cache
             # we return the max position + 1 to indicate max position
@@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int):
 def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype):
     """
-    Replace all KV caches in the module with ETCustomStaticCache.
+    Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache.
     This modifies the model in place.
     Args:
@@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
         if getattr(module, "replace_cache", None) is not None:
             static_cache = ETCustomStaticCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
             module.replace_cache(static_cache)
         else:
             module.static_cache = ETCustomStaticCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
             # Dont know why we need to this even though
@@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
         if getattr(module, "replace_cache", None) is not None:
             hybrid_cache = ETCustomHybridCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
             module.replace_cache(hybrid_cache)
         else:
             module.cache = ETCustomHybridCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
         # Register cache attributes for each layer
         for i in range(len(module.cache.kv_cache)):
             setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
             setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
-            if module.cache.is_sliding[i]:
+            if module.cache.layers[i].is_sliding():
                 # Register cache_positions as buffer for sliding window layers
                 # This prevents it from being traced as a constant
                 module.register_buffer(

diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py
index 23e6819a..06522c27 100644
--- a/optimum/exporters/executorch/integrations.py
+++ b/optimum/exporters/executorch/integrations.py
@@ -395,8 +395,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi
         wrapped_decoder = (
             Seq2SeqLMDecoderExportableModuleWithStaticCache(
                 model=self.full_model,
-                max_static_cache_length=self.generation_config.cache_config.max_cache_len,
-                batch_size=self.generation_config.cache_config.batch_size,
+                max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"),
+                batch_size=self.generation_config.cache_config.get("batch_size"),
             )
             .to("cpu")
             .eval()

diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py
index 70447957..e805ad4e 100644
--- a/optimum/exporters/executorch/utils.py
+++ b/optimum/exporters/executorch/utils.py
@@ -53,8 +53,8 @@ def save_config_to_constant_methods(
     # Check for cache_config and its attributes
     cache_config = getattr(generation_config, "cache_config", None)
     if cache_config is not None:
-        max_batch_size = getattr(cache_config, "batch_size", None)
-        max_seq_len = getattr(cache_config, "max_cache_len", None)
+        max_batch_size = cache_config.get("batch_size")
+        max_seq_len = cache_config.get("max_cache_len")
         if max_batch_size is not None:
             metadata["get_max_batch_size"] = max_batch_size

diff --git a/setup.py b/setup.py
index c7fa93ed..7d447fa6 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 INSTALL_REQUIRE = [
     "optimum~=1.24",
     "executorch>=0.6.0",
-    "transformers==4.51.3",
+    "transformers==4.54.1",
 ]
 TESTS_REQUIRE = [

From 3d223a2cc232ca367f2a42039837261d6c3c5c12 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Thu, 31 Jul 2025 18:08:19 -0700
Subject: [PATCH 02/31] Bump torch

---
 install_dev.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/install_dev.py b/install_dev.py
index 14012554..87564c3d 100644
--- a/install_dev.py
+++ b/install_dev.py
@@ -6,9 +6,9 @@ def install_torch_nightly_deps():
     """Install torch related dependencies from pinned nightly"""
     EXECUTORCH_NIGHTLY_VERSION = "dev20250625"
-    TORCHAO_NIGHTLY_VERSION = "dev20250620"
+    TORCHAO_NIGHTLY_VERSION = "dev20250710"
     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
-    TORCH_NIGHTLY_VERSION = "dev20250601"
+    TORCH_NIGHTLY_VERSION = "dev20250716"
     subprocess.check_call(
         [
             sys.executable,
             "-m",
             "pip",
             "install",
             f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}",
-            f"torch==2.8.0.{TORCH_NIGHTLY_VERSION}",
-            f"torchvision==0.23.0.{TORCH_NIGHTLY_VERSION}",
+            f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}",
+            f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}",
             f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}",
             f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}",
             "--extra-index-url",
             "https://download.pytorch.org/whl/nightly/cpu",
         ]
     )

From 207f8b1c787efb878aae00973320503437b459f5 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 4 Aug 2025 12:00:29 -0700
Subject: [PATCH 03/31] Fix no module found error for custom_kv_cache

---
 optimum/exporters/executorch/integrations.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py
index 06522c27..35406190 100644
--- a/optimum/exporters/executorch/integrations.py
+++ b/optimum/exporters/executorch/integrations.py
@@ -28,6 +28,7 @@
 )
 from transformers.generation.configuration_utils import GenerationConfig
+from executorch import version as executorch_version
 from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache
 from optimum.utils.import_utils import is_transformers_version
@@ -89,7 +90,10 @@ def _prepare_export_inputs(self):
         return example_input_ids, example_cache_position, dynamic_shapes, strict
     def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module):
-        if is_transformers_version(">=", "4.53.0.dev0"):
+        if (
+            is_transformers_version(">=", "4.53.0.dev0")
+            and parse(executorch_version.__version__).base_version > "0.6.0"
+        ):
             from transformers.integrations.executorch import sdpa_mask_without_vmap
             from transformers.masking_utils import AttentionMaskInterface
             from transformers.modeling_utils import AttentionInterface
@@ -126,7 +130,7 @@ def export(
         )
         self._register_attention_mask_for_4_53(exportable_module)
-        if self.use_custom_kv_cache:
+        if self.use_custom_kv_cache and parse(executorch_version.__version__).base_version > "0.6.0":
             from optimum.executorch.attentions.custom_kv_cache import (
                 replace_with_et_custom_kv_cache,
             )

From bc828412787ced35d92f97c5fb402905d04d8e76 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 4 Aug 2025 12:07:11 -0700
Subject: [PATCH 04/31] Try to fix Missing operator: [8] quantized_decomposed::embedding_byte.out

---
 optimum/executorch/modeling.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py
index 28ffc2fd..710b3403 100644
--- a/optimum/executorch/modeling.py
+++ b/optimum/executorch/modeling.py
@@ -24,6 +24,7 @@
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa
 from transformers import (
     AutoModelForCausalLM,
     AutoModelForImageClassification,

From 35fc91863f732546b2fbb669c80c7fa30d4bb29e Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 4 Aug 2025 13:19:37 -0700
Subject: [PATCH 05/31] Fix quantization requires torchao >= 0.11.0

---
 optimum/exporters/executorch/quantization.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py
index c2e2028f..299a2ddc 100644
--- a/optimum/exporters/executorch/quantization.py
+++ b/optimum/exporters/executorch/quantization.py
@@ -26,10 +26,6 @@ def quantize_model_(
     if not (qlinear_config or qembedding_config):
return - # TODO: Update torchao to use 0.11.0 once released - if parse(torchao.__version__) < parse("0.11.0.dev0"): - raise RuntimeError("Quantization requires torchao >= 0.11.0. Please upgrade torchao.") - from torchao.quantization.granularity import PerAxis, PerGroup from torchao.quantization.quant_api import ( Int8DynamicActivationIntxWeightConfig, From 6a26464ae6448c690d0c9b20350027b21b431728 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 4 Aug 2025 17:09:51 -0700 Subject: [PATCH 06/31] Fix sliding window, print loaded ops --- optimum/executorch/attentions/custom_kv_cache.py | 6 +++--- optimum/executorch/modeling.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index ab7e485e..a9d912ab 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -210,7 +210,7 @@ def __init__( # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. self.kv_cache = torch.nn.ModuleList() for layer in self.layers: - if layer.is_sliding(): + if layer.is_sliding: # This is a sliding window layer layer_cache = CustomRingKVCache( max_batch_size=layer.max_batch_size, @@ -281,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.layers[layer_idx].is_sliding(): + if self.layers[layer_idx].is_sliding: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -385,7 +385,7 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.layers[i].is_sliding(): + if module.cache.layers[i].is_sliding: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 710b3403..03cb7593 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -186,6 +186,13 @@ def _from_pretrained( subfolder=subfolder, local_files_only=local_files_only, ) + + from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa + from executorch.kernels import quantized # noqa + from executorch.extension.pybindings.portable_lib import _get_operator_names + print("----------- LOADED OPS ----------") + print('\n'.join(_get_operator_names())) + print("---------------------------------") model = _load_for_executorch(model_cache_path) logging.info( f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)" From 4d68263c367468c1b2db47c3cbc53731027efa35 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 4 Aug 2025 18:33:23 -0700 Subject: [PATCH 07/31] Bump ET nightly pin, fixes missing quantized ops --- install_dev.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_dev.py b/install_dev.py index 87564c3d..096e3774 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def 
install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250625" + EXECUTORCH_NIGHTLY_VERSION = "dev20250730" TORCHAO_NIGHTLY_VERSION = "dev20250710" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250716" @@ -15,7 +15,7 @@ def install_torch_nightly_deps(): "-m", "pip", "install", - f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}", + f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}", f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", From 6a3e1d4a074553443069fd7d8b28ad1324d9671b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 4 Aug 2025 20:55:09 -0700 Subject: [PATCH 08/31] Fix no Q_ANNOTATION_KEY --- install_dev.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install_dev.py b/install_dev.py index 096e3774..3fac4546 100644 --- a/install_dev.py +++ b/install_dev.py @@ -6,9 +6,9 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" EXECUTORCH_NIGHTLY_VERSION = "dev20250730" - TORCHAO_NIGHTLY_VERSION = "dev20250710" + TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 - TORCH_NIGHTLY_VERSION = "dev20250716" + TORCH_NIGHTLY_VERSION = "dev20250725" subprocess.check_call( [ sys.executable, @@ -19,7 +19,7 @@ def install_torch_nightly_deps(): f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", - f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}", + f"torchao==0.13.0.{TORCHAO_NIGHTLY_VERSION}", "--extra-index-url", "https://download.pytorch.org/whl/nightly/cpu", ] From 2b5fe7ec0993742d4714cf06ffa2822496514e8b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 8 Aug 2025 14:32:55 -0700 Subject: [PATCH 09/31] Try to fix segfault/bus error by holding onto temp dir --- optimum/executorch/modeling.py | 73 ++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 03cb7593..7b1c4c62 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -16,6 +16,8 @@ import logging import os +import tempfile +import shutil from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -102,6 +104,34 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon setattr(self, key, value) self.stats = Stats() + + # Initialize cleanup tracking + self._temp_dir = None + + def __del__(self): + """Clean up temporary files when the model instance is destroyed.""" + self._cleanup_temp_resources() + + def _cleanup_temp_resources(self): + """Clean up temporary directory and files.""" + if hasattr(self, '_temp_dir') and self._temp_dir is not None: + try: + if hasattr(self._temp_dir, 'cleanup'): + # It's a TemporaryDirectory object + logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}") + self._temp_dir.cleanup() + logging.info(f"Temporary directory cleanup completed") + elif isinstance(self._temp_dir, (str, Path)): + # It's a path + logging.info(f"Cleaning up temporary path: {self._temp_dir}") + 
shutil.rmtree(self._temp_dir, ignore_errors=True) + logging.info(f"Temporary path cleanup completed") + except Exception as e: + # Log cleanup errors for debugging + logging.warning(f"Error during temp directory cleanup: {e}") + pass + finally: + self._temp_dir = None @abstractmethod def forward(self, *args, **kwargs): @@ -250,7 +280,7 @@ def _export( inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class) logging.info(f"Inferred task from model class: {inferred_task}") - save_dir = TemporaryDirectory() + save_dir = TemporaryDirectory(prefix="executorch_export_") save_dir_path = Path(save_dir.name) # Export to ExecuTorch and save the pte file to the temporary directory @@ -274,7 +304,16 @@ def _export( for name, _ in executorch_progs.items(): models.update(cls._from_pretrained(save_dir_path, file_name=f"{name}.pte", config=config)) - return models + # Log temp directory info for debugging + logging.info(f"Created temporary directory: {save_dir_path}") + for name in executorch_progs.keys(): + pte_file = save_dir_path / f"{name}.pte" + if pte_file.exists(): + logging.info(f"PTE file exists at export: {pte_file} (size: {pte_file.stat().st_size} bytes)") + else: + logging.warning(f"PTE file missing at export: {pte_file}") + + return models, save_dir def _save_pretrained(self, save_directory): """ @@ -345,8 +384,9 @@ def from_pretrained( f"Could not infer whether the model was already converted or not to the ExecuTorch IR, keeping `export={export}`.\n{exception}" ) + temp_dir = None if _export: - models_dict = cls._export( + models_dict, temp_dir = cls._export( model_id=model_id, config=config, revision=revision, @@ -376,7 +416,14 @@ def from_pretrained( ) ) - return cls(models_dict, config) + model_instance = cls(models_dict, config) + + # Store the TemporaryDirectory reference to prevent GC + if temp_dir is not None: + model_instance._temp_dir = temp_dir + logging.info(f"Stored temp directory reference in model: {temp_dir.name}") + + return model_instance class ExecuTorchModelForSeq2SeqLM(ExecuTorchModelBase): @@ -647,12 +694,30 @@ def forward( Returns: torch.Tensor: Logits output from the model. 
""" + # Check if temp directory and PTE file still exist before forward pass + if hasattr(self, '_temp_dir') and self._temp_dir is not None: + temp_path = Path(self._temp_dir.name) + logging.info(f"Forward pass - temp directory exists: {temp_path.exists()}") + if temp_path.exists(): + pte_files = list(temp_path.glob("*.pte")) + logging.info(f"Forward pass - PTE files found: {len(pte_files)}") + for pte_file in pte_files: + logging.info(f"Forward pass - PTE file: {pte_file} exists: {pte_file.exists()}, size: {pte_file.stat().st_size if pte_file.exists() else 'N/A'}") + else: + logging.error(f"Forward pass - temp directory missing: {temp_path}") + else: + logging.info("Forward pass - no temp directory reference stored") + self.stats.on_model_execution_start() try: logits = self.model.forward((input_ids, cache_position))[0] except Exception as e: shapes = {name: val.shape for name, val in locals().items() if hasattr(val, "shape")} + logging.error(f"Forward pass failed - temp dir exists: {hasattr(self, '_temp_dir') and self._temp_dir is not None}") + if hasattr(self, '_temp_dir') and self._temp_dir is not None: + temp_path = Path(self._temp_dir.name) + logging.error(f"Forward pass failed - temp directory: {temp_path} exists: {temp_path.exists()}") print(f"Exception: {e}.\n{self.model.method_meta('forward')}\narg shapes: {shapes}") raise From bb0089cd34ab859c1467d0d8b12b702b1b6f9c2c Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 8 Aug 2025 19:03:06 -0700 Subject: [PATCH 10/31] Bigger mac runners --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index af830a02..bf83331e 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -36,7 +36,7 @@ jobs: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} executorch-version: ['0.6.0', 'nightly'] python-version: ['3.11'] - os: [macos-15, ubuntu-22.04] + os: [macos-15-large, ubuntu-22.04] # Custom job name, now shortened and cleaner name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) From 72802e3453d616d05d913f976c8561f95eae9ec0 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 12:11:15 -0700 Subject: [PATCH 11/31] Revert "Bigger mac runners" This reverts commit 27097140cd11a11a886dfb14846893b0c31608a3. 
--- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index bf83331e..af830a02 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -36,7 +36,7 @@ jobs: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} executorch-version: ['0.6.0', 'nightly'] python-version: ['3.11'] - os: [macos-15-large, ubuntu-22.04] + os: [macos-15, ubuntu-22.04] # Custom job name, now shortened and cleaner name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) From 9876c7e850ad816a044223d26dd6e2a40ed27219 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 12:55:58 -0700 Subject: [PATCH 12/31] Add helpful logs --- optimum/executorch/modeling.py | 7 +++++-- optimum/exporters/executorch/__main__.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 7b1c4c62..55bfa477 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -345,6 +345,7 @@ def from_pretrained( logger.info("Offline mode: setting `local_files_only=True`") local_files_only = True + # See if model was already exported to ExecuTorch and uplaoded to the HuggingFace repo. _export = export try: if local_files_only and not os.path.isdir(model_id): @@ -371,13 +372,11 @@ def from_pretrained( if export: logger.warning( f"The model {model_id} was already converted to the ExecuTorch IR but got `export=True`, the model will be converted to ExecuTorch once again. " - # "Don't forget to save the resulting model with `.save_pretrained()`" ) _export = True else: logger.warning( f"No ExecuTorch files were found for {model_id}, setting `export=True` to convert the model to the ExecuTorch IR. " - # "Don't forget to save the resulting model with `.save_pretrained()`" ) except Exception as exception: logger.warning( @@ -386,6 +385,7 @@ def from_pretrained( temp_dir = None if _export: + logging.info(f"Exporting {model_id} to ExecuTorch program...") models_dict, temp_dir = cls._export( model_id=model_id, config=config, @@ -399,6 +399,7 @@ def from_pretrained( **kwargs, ) else: + logging.info(f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export.") models_dict = {} for pte_file in pte_files: models_dict.update( @@ -711,7 +712,9 @@ def forward( self.stats.on_model_execution_start() try: + logging.info("Running forward()...") logits = self.model.forward((input_ids, cache_position))[0] + logging.info(f"logits from forward(): {logits}") except Exception as e: shapes = {name: val.shape for name, val in locals().items() if hasattr(val, "shape")} logging.error(f"Forward pass failed - temp dir exists: {hasattr(self, '_temp_dir') and self._temp_dir is not None}") diff --git a/optimum/exporters/executorch/__main__.py b/optimum/exporters/executorch/__main__.py index 7a9fe9c8..d505af64 100644 --- a/optimum/exporters/executorch/__main__.py +++ b/optimum/exporters/executorch/__main__.py @@ -15,6 +15,7 @@ """Entry point to the optimum.exporters.executorch command line.""" import argparse +import logging import os import warnings from pathlib import Path @@ -130,13 +131,15 @@ def main_export( kwargs["force_download"] = force_download kwargs["config"] = config + # 1. 
Load model, apply source transformations, and torch.export() into a graph (ExportedProgram). + logging.info(f"Loading {model_name_or_path} and exporting to static graph...") recipe_kwargs = kwargs.pop("recipe_kwargs", {}) - model = task_func(model_name_or_path, **kwargs) + # 2. Export to ExecuTorch through ExecuTorch's lowering APIs. + logging.info(f"Lowering {model_name_or_path} to ExecuTorch...") if not os.path.exists(output_dir): os.makedirs(output_dir) - return export_to_executorch( model=model, task=task, From 19f4d21bf780851c7b96db447c377118208b6a0e Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 13:41:44 -0700 Subject: [PATCH 13/31] Re-enable smollm3 tests for linux --- tests/models/test_modeling_smollm3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_modeling_smollm3.py b/tests/models/test_modeling_smollm3.py index 34a63510..10dce8ee 100644 --- a/tests/models/test_modeling_smollm3.py +++ b/tests/models/test_modeling_smollm3.py @@ -36,7 +36,7 @@ @pytest.mark.skipif( - is_transformers_version("<", "4.53.1") or is_linux_ci, + is_transformers_version("<", "4.53.1"), reason="Only available on transformers >= 4.53.1", ) class ExecuTorchModelIntegrationTest(unittest.TestCase): From 99805f82be3461bb9bd269a29042c111f79ef079 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 14:56:10 -0700 Subject: [PATCH 14/31] Experiment reverting transformers bump --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 ++++++++++--------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 41 insertions(+), 38 deletions(-) diff --git a/install_dev.py b/install_dev.py index 3fac4546..486edb3c 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index a9d912ab..b5416f87 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: + for _ in range(config.num_hidden_layers): layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,29 +202,32 @@ def __init__( layer_device_map=layer_device_map, ) + # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer. - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. 
+ # Create a list of cache instances, one per layer + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: - if layer.is_sliding: + for layer_idx in range(config.num_hidden_layers): + # newer version of transfomer has is_sliding defined + # for HybridCache + if self.is_sliding[layer_idx]: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.sliding_window_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -281,7 +284,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.layers[layer_idx].is_sliding: + if self.is_sliding[layer_idx]: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -305,7 +308,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. + Replace all KV caches in the module with ETCustomStaticCache. This modifies the model in place. 
Args: @@ -339,18 +342,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Dont know why we need to this even though @@ -367,25 +370,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.layers[i].is_sliding: + if module.cache.is_sliding[i]: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 35406190..e6447562 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), - batch_size=self.generation_config.cache_config.get("batch_size"), + max_static_cache_length=self.generation_config.cache_config.max_cache_len, + batch_size=self.generation_config.cache_config.batch_size, ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index e805ad4e..70447957 100644 --- a/optimum/exporters/executorch/utils.py +++ 
b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = cache_config.get("batch_size") - max_seq_len = cache_config.get("max_cache_len") + max_batch_size = getattr(cache_config, "batch_size", None) + max_seq_len = getattr(cache_config, "max_cache_len", None) if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index 7d447fa6..c7fa93ed 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.54.1", + "transformers==4.51.3", ] TESTS_REQUIRE = [ From 108ed173f63a028618021c9c928cbaa89aa45867 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 13:56:04 -0700 Subject: [PATCH 15/31] Revert "Experiment reverting transformers bump" This reverts commit 99805f82be3461bb9bd269a29042c111f79ef079. --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 +++++++++---------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 38 insertions(+), 41 deletions(-) diff --git a/install_dev.py b/install_dev.py index 486edb3c..3fac4546 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index b5416f87..a9d912ab 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for _ in range(config.num_hidden_layers): + for layer in self.layers: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,32 +202,29 @@ def __init__( layer_device_map=layer_device_map, ) - # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers + # Create a list of cache instances, one per layer. + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. 
self.kv_cache = torch.nn.ModuleList() - for layer_idx in range(config.num_hidden_layers): - # newer version of transfomer has is_sliding defined - # for HybridCache - if self.is_sliding[layer_idx]: + for layer in self.layers: + if layer.is_sliding: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.sliding_window_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.is_sliding[layer_idx]: + if self.layers[layer_idx].is_sliding: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache. + Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. This modifies the model in place. Args: @@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Dont know why we need to this even though @@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - 
max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.is_sliding[i]: + if module.cache.layers[i].is_sliding: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index e6447562..35406190 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.max_cache_len, - batch_size=self.generation_config.cache_config.batch_size, + max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), + batch_size=self.generation_config.cache_config.get("batch_size"), ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index 70447957..e805ad4e 100644 --- a/optimum/exporters/executorch/utils.py +++ b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = getattr(cache_config, "batch_size", None) - max_seq_len = getattr(cache_config, "max_cache_len", None) + max_batch_size = cache_config.get("batch_size") + max_seq_len = cache_config.get("max_cache_len") if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index c7fa93ed..7d447fa6 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.51.3", + "transformers==4.54.1", ] TESTS_REQUIRE = [ From 59778ebe5c06ec0202a7ed3b0d7b6324a45a7274 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 14:12:40 -0700 Subject: [PATCH 16/31] Formatting and remove logs --- optimum/executorch/modeling.py | 55 ++++++---------------------------- 1 file changed, 9 insertions(+), 46 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 55bfa477..80763533 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -104,19 +104,19 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon setattr(self, key, value) self.stats = Stats() - + # Initialize cleanup tracking self._temp_dir = None def __del__(self): """Clean up temporary files when the model instance is destroyed.""" self._cleanup_temp_resources() - + def _cleanup_temp_resources(self): """Clean up temporary directory and files.""" - if hasattr(self, '_temp_dir') and self._temp_dir is not None: + if hasattr(self, "_temp_dir") and 
self._temp_dir is not None: try: - if hasattr(self._temp_dir, 'cleanup'): + if hasattr(self._temp_dir, "cleanup"): # It's a TemporaryDirectory object logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}") self._temp_dir.cleanup() @@ -216,17 +216,7 @@ def _from_pretrained( subfolder=subfolder, local_files_only=local_files_only, ) - - from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa - from executorch.kernels import quantized # noqa - from executorch.extension.pybindings.portable_lib import _get_operator_names - print("----------- LOADED OPS ----------") - print('\n'.join(_get_operator_names())) - print("---------------------------------") model = _load_for_executorch(model_cache_path) - logging.info( - f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)" - ) return {default_file_name.removesuffix(_PTE_SUFFIX): model} @@ -304,15 +294,6 @@ def _export( for name, _ in executorch_progs.items(): models.update(cls._from_pretrained(save_dir_path, file_name=f"{name}.pte", config=config)) - # Log temp directory info for debugging - logging.info(f"Created temporary directory: {save_dir_path}") - for name in executorch_progs.keys(): - pte_file = save_dir_path / f"{name}.pte" - if pte_file.exists(): - logging.info(f"PTE file exists at export: {pte_file} (size: {pte_file.stat().st_size} bytes)") - else: - logging.warning(f"PTE file missing at export: {pte_file}") - return models, save_dir def _save_pretrained(self, save_directory): @@ -399,7 +380,9 @@ def from_pretrained( **kwargs, ) else: - logging.info(f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export.") + logging.info( + f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export." + ) models_dict = {} for pte_file in pte_files: models_dict.update( @@ -418,12 +401,12 @@ def from_pretrained( ) model_instance = cls(models_dict, config) - + # Store the TemporaryDirectory reference to prevent GC if temp_dir is not None: model_instance._temp_dir = temp_dir logging.info(f"Stored temp directory reference in model: {temp_dir.name}") - + return model_instance @@ -695,32 +678,12 @@ def forward( Returns: torch.Tensor: Logits output from the model. 
""" - # Check if temp directory and PTE file still exist before forward pass - if hasattr(self, '_temp_dir') and self._temp_dir is not None: - temp_path = Path(self._temp_dir.name) - logging.info(f"Forward pass - temp directory exists: {temp_path.exists()}") - if temp_path.exists(): - pte_files = list(temp_path.glob("*.pte")) - logging.info(f"Forward pass - PTE files found: {len(pte_files)}") - for pte_file in pte_files: - logging.info(f"Forward pass - PTE file: {pte_file} exists: {pte_file.exists()}, size: {pte_file.stat().st_size if pte_file.exists() else 'N/A'}") - else: - logging.error(f"Forward pass - temp directory missing: {temp_path}") - else: - logging.info("Forward pass - no temp directory reference stored") - self.stats.on_model_execution_start() try: - logging.info("Running forward()...") logits = self.model.forward((input_ids, cache_position))[0] - logging.info(f"logits from forward(): {logits}") except Exception as e: shapes = {name: val.shape for name, val in locals().items() if hasattr(val, "shape")} - logging.error(f"Forward pass failed - temp dir exists: {hasattr(self, '_temp_dir') and self._temp_dir is not None}") - if hasattr(self, '_temp_dir') and self._temp_dir is not None: - temp_path = Path(self._temp_dir.name) - logging.error(f"Forward pass failed - temp directory: {temp_path} exists: {temp_path.exists()}") print(f"Exception: {e}.\n{self.model.method_meta('forward')}\narg shapes: {shapes}") raise From ff8a2a1fab745987fc1798c47539fb2a855c19bf Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:25:15 -0700 Subject: [PATCH 17/31] Bump ET release from 0.6 -> 0.7 --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index af830a02..13b61c12 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} - executorch-version: ['0.6.0', 'nightly'] + executorch-version: ['0.7.0', 'nightly'] python-version: ['3.11'] os: [macos-15, ubuntu-22.04] From a3009ca979c41b2335cd358e5375538d52c6007a Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:46:57 -0700 Subject: [PATCH 18/31] Bisect down to ET 20250701 --- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index 3fac4546..1f7a0bb7 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250730" + EXECUTORCH_NIGHTLY_VERSION = "dev20250701" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From ae488b1bd135d2e0c16a20a4da0aeaf0c1a8a147 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 14:56:10 -0700 Subject: [PATCH 19/31] Experiment reverting transformers bump --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 ++++++++++--------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 41 insertions(+), 38 deletions(-) diff --git 
a/install_dev.py b/install_dev.py index 1f7a0bb7..bdf54083 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index a9d912ab..b5416f87 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: + for _ in range(config.num_hidden_layers): layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,29 +202,32 @@ def __init__( layer_device_map=layer_device_map, ) + # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer. - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. + # Create a list of cache instances, one per layer + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: - if layer.is_sliding: + for layer_idx in range(config.num_hidden_layers): + # newer version of transfomer has is_sliding defined + # for HybridCache + if self.is_sliding[layer_idx]: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.sliding_window_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -281,7 +284,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.layers[layer_idx].is_sliding: + if self.is_sliding[layer_idx]: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -305,7 +308,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. + Replace all KV caches in the module with ETCustomStaticCache. This modifies the model in place. 
Args: @@ -339,18 +342,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Dont know why we need to this even though @@ -367,25 +370,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.layers[i].is_sliding: + if module.cache.is_sliding[i]: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 35406190..e6447562 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), - batch_size=self.generation_config.cache_config.get("batch_size"), + max_static_cache_length=self.generation_config.cache_config.max_cache_len, + batch_size=self.generation_config.cache_config.batch_size, ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index e805ad4e..70447957 100644 --- a/optimum/exporters/executorch/utils.py +++ 
b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = cache_config.get("batch_size") - max_seq_len = cache_config.get("max_cache_len") + max_batch_size = getattr(cache_config, "batch_size", None) + max_seq_len = getattr(cache_config, "max_cache_len", None) if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index 7d447fa6..c7fa93ed 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.54.1", + "transformers==4.51.3", ] TESTS_REQUIRE = [ From b7a2fa1361cc2c66690608f4930d99ea94c30330 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:49:13 -0700 Subject: [PATCH 20/31] Clean --- optimum/executorch/modeling.py | 3 +++ optimum/exporters/executorch/__main__.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 80763533..fee2f1a2 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -217,6 +217,9 @@ def _from_pretrained( local_files_only=local_files_only, ) model = _load_for_executorch(model_cache_path) + logging.info( + f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)" + ) return {default_file_name.removesuffix(_PTE_SUFFIX): model} diff --git a/optimum/exporters/executorch/__main__.py b/optimum/exporters/executorch/__main__.py index d505af64..854bf260 100644 --- a/optimum/exporters/executorch/__main__.py +++ b/optimum/exporters/executorch/__main__.py @@ -134,12 +134,14 @@ def main_export( # 1. Load model, apply source transformations, and torch.export() into a graph (ExportedProgram). logging.info(f"Loading {model_name_or_path} and exporting to static graph...") recipe_kwargs = kwargs.pop("recipe_kwargs", {}) + model = task_func(model_name_or_path, **kwargs) # 2. Export to ExecuTorch through ExecuTorch's lowering APIs. 
logging.info(f"Lowering {model_name_or_path} to ExecuTorch...") if not os.path.exists(output_dir): os.makedirs(output_dir) + return export_to_executorch( model=model, task=task, From 1e0a67124bafa6aff27689ad40c5ee302ff63d22 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:50:31 -0700 Subject: [PATCH 21/31] Bisect down to ET 20250628 --- install_dev.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_dev.py b/install_dev.py index bdf54083..4ce4fe2b 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250701" + EXECUTORCH_NIGHTLY_VERSION = "dev20250628" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" @@ -15,7 +15,7 @@ def install_torch_nightly_deps(): "-m", "pip", "install", - f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}", + f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}", f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", From 896f0da22f13a139cfd7f2a92b31ea6d8c13d20b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:57:44 -0700 Subject: [PATCH 22/31] Bisect down to ET 20250626 --- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index 4ce4fe2b..ae4c8a8b 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250628" + EXECUTORCH_NIGHTLY_VERSION = "dev20250626" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From abd641b44d2ffabab2858044f117d6d96169e34b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:10:50 -0700 Subject: [PATCH 23/31] Revert "Bisect down to ET 20250626" This reverts commit 896f0da22f13a139cfd7f2a92b31ea6d8c13d20b. --- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index ae4c8a8b..4ce4fe2b 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250626" + EXECUTORCH_NIGHTLY_VERSION = "dev20250628" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From 7f7f9c2c7862a9b49ab6c513eb84d47b27fc279f Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:11:05 -0700 Subject: [PATCH 24/31] Revert "Bisect down to ET 20250628" This reverts commit 1e0a67124bafa6aff27689ad40c5ee302ff63d22. 
--- install_dev.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_dev.py b/install_dev.py index 4ce4fe2b..bdf54083 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250628" + EXECUTORCH_NIGHTLY_VERSION = "dev20250701" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" @@ -15,7 +15,7 @@ def install_torch_nightly_deps(): "-m", "pip", "install", - f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}", + f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}", f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", From 5f8a56f4a10d97191ddaf300c0d26aa36e031f55 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:11:25 -0700 Subject: [PATCH 25/31] Revert "Experiment reverting transformers bump" This reverts commit ae488b1bd135d2e0c16a20a4da0aeaf0c1a8a147. --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 +++++++++---------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 38 insertions(+), 41 deletions(-) diff --git a/install_dev.py b/install_dev.py index bdf54083..1f7a0bb7 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index b5416f87..a9d912ab 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for _ in range(config.num_hidden_layers): + for layer in self.layers: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,32 +202,29 @@ def __init__( layer_device_map=layer_device_map, ) - # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers + # Create a list of cache instances, one per layer. + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. 
self.kv_cache = torch.nn.ModuleList() - for layer_idx in range(config.num_hidden_layers): - # newer version of transfomer has is_sliding defined - # for HybridCache - if self.is_sliding[layer_idx]: + for layer in self.layers: + if layer.is_sliding: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.sliding_window_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.is_sliding[layer_idx]: + if self.layers[layer_idx].is_sliding: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache. + Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. This modifies the model in place. Args: @@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Dont know why we need to this even though @@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - 
max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.is_sliding[i]: + if module.cache.layers[i].is_sliding: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index e6447562..35406190 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.max_cache_len, - batch_size=self.generation_config.cache_config.batch_size, + max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), + batch_size=self.generation_config.cache_config.get("batch_size"), ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index 70447957..e805ad4e 100644 --- a/optimum/exporters/executorch/utils.py +++ b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = getattr(cache_config, "batch_size", None) - max_seq_len = getattr(cache_config, "max_cache_len", None) + max_batch_size = cache_config.get("batch_size") + max_seq_len = cache_config.get("max_cache_len") if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index c7fa93ed..7d447fa6 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.51.3", + "transformers==4.54.1", ] TESTS_REQUIRE = [ From 92bc2ba3f0659c7681a3106b123b75a0017c92ce Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:11:36 -0700 Subject: [PATCH 26/31] Revert "Bisect down to ET 20250701" This reverts commit a3009ca979c41b2335cd358e5375538d52c6007a. 
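Aside, not part of the patch series: the transformers 4.54 bump restored in PATCH 25 changes two call-site patterns that recur throughout these diffs. generation_config.cache_config is now read dict-style with .get("batch_size") / .get("max_cache_len") instead of attribute access, and per-layer sliding-window checks move from cache.is_sliding[i] to cache.layers[i].is_sliding. A version-agnostic sketch of both lookups follows; the helper names are illustrative and not part of the patched code, and the dict assumption is inferred from the .get() call sites shown above:

    def read_cache_config(cache_config, key, default=None):
        """Read a cache_config entry on either side of the transformers 4.54 change."""
        # Assumption: on transformers >= 4.54 cache_config behaves like a plain dict,
        # as the .get("batch_size") call sites in PATCH 25 suggest.
        if isinstance(cache_config, dict):
            return cache_config.get(key, default)
        # Earlier releases expose attribute-style access (cache_config.batch_size, ...).
        return getattr(cache_config, key, default)


    def layer_is_sliding(cache, layer_idx):
        """Check whether a hybrid-cache layer is a sliding-window layer."""
        # transformers >= 4.54: per-layer cache objects expose .is_sliding.
        layers = getattr(cache, "layers", None)
        if layers is not None:
            return bool(layers[layer_idx].is_sliding)
        # Earlier releases: HybridCache keeps a flat is_sliding list.
        return bool(cache.is_sliding[layer_idx])


    # Usage mirroring the call sites touched in PATCH 25:
    # max_batch_size = read_cache_config(generation_config.cache_config, "batch_size")
    # max_cache_len  = read_cache_config(generation_config.cache_config, "max_cache_len")
    # needs_ring_kv  = layer_is_sliding(module.cache, layer_idx)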
--- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index 1f7a0bb7..3fac4546 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250701" + EXECUTORCH_NIGHTLY_VERSION = "dev20250730" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From 4abb2eccf1e60267b84f1f2a956ace4e3b3eb222 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:43:21 -0700 Subject: [PATCH 27/31] Skip mac tests --- .github/workflows/test_models.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 13b61c12..a72dba55 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -36,7 +36,8 @@ jobs: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} executorch-version: ['0.7.0', 'nightly'] python-version: ['3.11'] - os: [macos-15, ubuntu-22.04] + # os: [macos-15, ubuntu-22.04] # TODO(#122): Re-enable the mac tests after fixing seg fault. + os: [ubuntu-22.04] # Custom job name, now shortened and cleaner name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) From ad9b639461ae7e849e7b3b255557f0fa3f52476c Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:44:19 -0700 Subject: [PATCH 28/31] Remove unnecessary ET 0.6 guards --- optimum/exporters/executorch/integrations.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 35406190..532aa4d4 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -90,10 +90,7 @@ def _prepare_export_inputs(self): return example_input_ids, example_cache_position, dynamic_shapes, strict def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module): - if ( - is_transformers_version(">=", "4.53.0.dev0") - and parse(executorch_version.__version__).base_version > "0.6.0" - ): + if is_transformers_version(">=", "4.53.0.dev0"): from transformers.integrations.executorch import sdpa_mask_without_vmap from transformers.masking_utils import AttentionMaskInterface from transformers.modeling_utils import AttentionInterface @@ -130,7 +127,7 @@ def export( ) self._register_attention_mask_for_4_53(exportable_module) - if self.use_custom_kv_cache and parse(executorch_version.__version__).base_version > "0.6.0": + if self.use_custom_kv_cache: from optimum.executorch.attentions.custom_kv_cache import ( replace_with_et_custom_kv_cache, ) From b252038b4feec37e775fb96f7ba740f2a6726b07 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 09:22:27 -0700 Subject: [PATCH 29/31] Ruff format --- optimum/executorch/modeling.py | 5 ++--- optimum/exporters/executorch/integrations.py | 1 - optimum/exporters/executorch/quantization.py | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index fee2f1a2..58218e51 100644 
--- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -16,7 +16,6 @@ import logging import os -import tempfile import shutil from abc import ABC, abstractmethod from pathlib import Path @@ -120,12 +119,12 @@ def _cleanup_temp_resources(self): # It's a TemporaryDirectory object logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}") self._temp_dir.cleanup() - logging.info(f"Temporary directory cleanup completed") + logging.info("Temporary directory cleanup completed") elif isinstance(self._temp_dir, (str, Path)): # It's a path logging.info(f"Cleaning up temporary path: {self._temp_dir}") shutil.rmtree(self._temp_dir, ignore_errors=True) - logging.info(f"Temporary path cleanup completed") + logging.info("Temporary path cleanup completed") except Exception as e: # Log cleanup errors for debugging logging.warning(f"Error during temp directory cleanup: {e}") diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 532aa4d4..06522c27 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -28,7 +28,6 @@ ) from transformers.generation.configuration_utils import GenerationConfig -from executorch import version as executorch_version from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache from optimum.utils.import_utils import is_transformers_version diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py index 299a2ddc..8994fd41 100644 --- a/optimum/exporters/executorch/quantization.py +++ b/optimum/exporters/executorch/quantization.py @@ -16,8 +16,6 @@ from typing import Optional import torch -import torchao -from packaging.version import parse def quantize_model_( From e1353103e2da61b1e70e3af634651ae527f1bad0 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 11:19:36 -0700 Subject: [PATCH 30/31] Remove all transformers < 4.54 guards --- optimum/exporters/executorch/convert.py | 9 +- optimum/exporters/executorch/integrations.py | 115 ++++++++---------- tests/models/test_modeling_gemma3.py | 20 --- tests/models/test_modeling_phi4.py | 7 -- tests/models/test_modeling_qwen3.py | 9 -- tests/models/test_modeling_qwen3_embedding.py | 6 - tests/models/test_modeling_smollm3.py | 6 +- tests/models/test_modeling_whisper.py | 5 - 8 files changed, 57 insertions(+), 120 deletions(-) diff --git a/optimum/exporters/executorch/convert.py b/optimum/exporters/executorch/convert.py index 612b07fb..8a381423 100644 --- a/optimum/exporters/executorch/convert.py +++ b/optimum/exporters/executorch/convert.py @@ -19,20 +19,17 @@ from pathlib import Path from typing import Union +from transformers.integrations.executorch import sdpa_mask_without_vmap +from transformers.masking_utils import AttentionMaskInterface from transformers.modeling_utils import AttentionInterface from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward -from optimum.utils.import_utils import is_transformers_version from .recipe_registry import discover_recipes, recipe_registry AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward) -if is_transformers_version(">=", "4.53.0.dev0"): - from transformers.integrations.executorch import sdpa_mask_without_vmap - from transformers.masking_utils import AttentionMaskInterface - - AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap) 
+AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap) def export_to_executorch( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 06522c27..954a95da 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -29,7 +29,6 @@ from transformers.generation.configuration_utils import GenerationConfig from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache -from optimum.utils.import_utils import is_transformers_version from .utils import save_config_to_constant_methods @@ -71,7 +70,7 @@ def _prepare_export_inputs(self): and not (self.use_custom_kv_cache and self.use_custom_sdpa) ) - if is_transformers_version(">", "4.52.0") and not is_using_hybrid_cache_wo_custom_sdpa_kv_cache: + if not is_using_hybrid_cache_wo_custom_sdpa_kv_cache: # Prepare inputs with dynamic shapes seq_length = 3 # Sequence length > 1 to avoid specialization issues example_input_ids = torch.zeros((1, seq_length), dtype=torch.long) @@ -88,24 +87,23 @@ def _prepare_export_inputs(self): return example_input_ids, example_cache_position, dynamic_shapes, strict - def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module): - if is_transformers_version(">=", "4.53.0.dev0"): - from transformers.integrations.executorch import sdpa_mask_without_vmap - from transformers.masking_utils import AttentionMaskInterface - from transformers.modeling_utils import AttentionInterface - - _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module) - if self.use_custom_sdpa: - if self.use_custom_kv_cache: - AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache) - AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap) - # Manually set the attention implementation to custom_sdpa_ring_kv_cache - # This handles both regular sdpa and one for sliding window/local attention - exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache" - else: - # Manually set the attention implementation to custom_sdpa_ring_kv_cache - # This handles both regular sdpa and one for sliding window/local attention - exportable_module.model.model.config._attn_implementation = "custom_sdpa" + def _register_custom_attention(self, exportable_module: torch.nn.Module): + from transformers.integrations.executorch import sdpa_mask_without_vmap + from transformers.masking_utils import AttentionMaskInterface + from transformers.modeling_utils import AttentionInterface + + _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module) + if self.use_custom_sdpa: + if self.use_custom_kv_cache: + AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache) + AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap) + # Manually set the attention implementation to custom_sdpa_ring_kv_cache + # This handles both regular sdpa and one for sliding window/local attention + exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache" + else: + # Manually set the attention implementation to custom_sdpa_ring_kv_cache + # This handles both regular sdpa and one for sliding window/local attention + exportable_module.model.model.config._attn_implementation = "custom_sdpa" def export( self, @@ -114,55 +112,48 @@ def export( logging.info( f"Exporting using input_ids({input_ids.shape})={input_ids}, 
cache_position({cache_position.shape})={cache_position}, dynamic_shapes={dynamic_shapes}, strict={strict}" ) - if is_transformers_version(">", "4.52.0"): - from transformers.integrations.executorch import ( - TorchExportableModuleForDecoderOnlyLM, - ) - exportable_module = TorchExportableModuleForDecoderOnlyLM( - self.model, - max_batch_size=1, - max_cache_len=self.metadata.get("get_max_seq_len"), - ) - self._register_attention_mask_for_4_53(exportable_module) + from transformers.integrations.executorch import ( + TorchExportableModuleForDecoderOnlyLM, + ) - if self.use_custom_kv_cache: - from optimum.executorch.attentions.custom_kv_cache import ( - replace_with_et_custom_kv_cache, - ) + exportable_module = TorchExportableModuleForDecoderOnlyLM( + self.model, + max_batch_size=1, + max_cache_len=self.metadata.get("get_max_seq_len"), + ) + self._register_custom_attention(exportable_module) - replace_with_et_custom_kv_cache( - exportable_module.model, - self.model.config, - self.model.generation_config, - self.model.dtype, - ) + if self.use_custom_kv_cache: + from optimum.executorch.attentions.custom_kv_cache import ( + replace_with_et_custom_kv_cache, + ) - with torch.no_grad(): - exported_program = exportable_module.export(input_ids, cache_position, dynamic_shapes, strict) - # Apply RemoveTransposes pass to remove - # any back-to-back transpose ops that are not needed - # e.g. output of update_cache is transposed and - # input to custom_sdpa is transposed. - from executorch.extension.llm.export.export_passes import ( - RemoveRedundantTransposes, - ) + replace_with_et_custom_kv_cache( + exportable_module.model, + self.model.config, + self.model.generation_config, + self.model.dtype, + ) - mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0] - exported_program = torch.export.export( - mutated_gm, - args=(input_ids, cache_position), - kwargs={}, - dynamic_shapes=dynamic_shapes, - strict=strict, - ) - else: - # Path to use legacy API, static export only due to pinned transformers version - from transformers.integrations.executorch import ( - convert_and_export_with_cache, + with torch.no_grad(): + exported_program = exportable_module.export(input_ids, cache_position, dynamic_shapes, strict) + # Apply RemoveTransposes pass to remove + # any back-to-back transpose ops that are not needed + # e.g. output of update_cache is transposed and + # input to custom_sdpa is transposed. 
+ from executorch.extension.llm.export.export_passes import ( + RemoveRedundantTransposes, ) - exported_program = convert_and_export_with_cache(self.model, input_ids, cache_position) + mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0] + exported_program = torch.export.export( + mutated_gm, + args=(input_ids, cache_position), + kwargs={}, + dynamic_shapes=dynamic_shapes, + strict=strict, + ) return {"model": exported_program} diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py index 56666c04..11e91174 100644 --- a/tests/models/test_modeling_gemma3.py +++ b/tests/models/test_modeling_gemma3.py @@ -22,15 +22,11 @@ import unittest import pytest -import torchao -import transformers from executorch.extension.pybindings.portable_lib import ExecuTorchModule -from packaging.version import parse from transformers import AutoTokenizer from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForCausalLM -from optimum.utils.import_utils import is_transformers_version from ..utils import check_causal_lm_output_quality @@ -41,10 +37,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" -@pytest.mark.skipif( - is_transformers_version("<", "4.52.0.dev0"), - reason="Only available on transformers >= 4.52.0.dev0", -) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -119,10 +111,6 @@ def test_gemma3_text_generation_portable(self): @slow @pytest.mark.run_slow @pytest.mark.skipif(is_linux_ci, reason="OOM on linux runner") - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" @@ -191,10 +179,6 @@ def test_gemma3_text_generation_with_custom_sdpa_float16(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" @@ -230,10 +214,6 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa_kv_cache_8da4w_8we(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" diff --git a/tests/models/test_modeling_phi4.py b/tests/models/test_modeling_phi4.py index b0851444..fa1c9be6 100644 --- a/tests/models/test_modeling_phi4.py +++ b/tests/models/test_modeling_phi4.py @@ -21,7 +21,6 @@ import pytest import torchao -import transformers from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse from transformers import AutoConfig, AutoTokenizer @@ -43,12 +42,6 @@ def 
__init__(self, *args, **kwargs): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - is_linux_ci - or parse(transformers.__version__) < parse("4.52.0") - or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0. OOM on linux runner.", - ) def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "microsoft/Phi-4-mini-instruct" model = ExecuTorchModelForCausalLM.from_pretrained( diff --git a/tests/models/test_modeling_qwen3.py b/tests/models/test_modeling_qwen3.py index 0f0d8a36..4deacac8 100644 --- a/tests/models/test_modeling_qwen3.py +++ b/tests/models/test_modeling_qwen3.py @@ -23,7 +23,6 @@ import pytest import torchao -import transformers from executorch import version from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse @@ -214,10 +213,6 @@ def test_qwen3_text_generation_with_custom_sdpa_8da4w_8we(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0.dev0"), - reason="Only available on transformers >= 4.52.0.dev0", - ) def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache(self): model_id = "Qwen/Qwen3-0.6B" prompt = "Give me a short introduction to large language model." @@ -249,10 +244,6 @@ def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0", - ) def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "Qwen/Qwen3-0.6B" prompt = "Give me a short introduction to large language model." 
diff --git a/tests/models/test_modeling_qwen3_embedding.py b/tests/models/test_modeling_qwen3_embedding.py index 73002e9c..0146634f 100644 --- a/tests/models/test_modeling_qwen3_embedding.py +++ b/tests/models/test_modeling_qwen3_embedding.py @@ -19,8 +19,6 @@ import unittest import pytest -import torchao -import transformers from executorch import version from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse @@ -41,10 +39,6 @@ def __init__(self, *args, **kwargs): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0", - ) def test_qwen3_embedding_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "Qwen/Qwen3-Embedding-0.6B" prompt = "Explain gravity" diff --git a/tests/models/test_modeling_smollm3.py b/tests/models/test_modeling_smollm3.py index 10dce8ee..12eb3c63 100644 --- a/tests/models/test_modeling_smollm3.py +++ b/tests/models/test_modeling_smollm3.py @@ -25,7 +25,6 @@ from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForCausalLM -from optimum.utils.import_utils import is_transformers_version from ..utils import check_causal_lm_output_quality @@ -35,10 +34,7 @@ is_linux_ci = sys.platform.startswith("linux") and is_ci -@pytest.mark.skipif( - is_transformers_version("<", "4.53.1"), - reason="Only available on transformers >= 4.53.1", -) +@pytest.mark.skipif(is_linux_ci) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/tests/models/test_modeling_whisper.py b/tests/models/test_modeling_whisper.py index 3ecfc1c7..e88cb80f 100644 --- a/tests/models/test_modeling_whisper.py +++ b/tests/models/test_modeling_whisper.py @@ -28,16 +28,11 @@ from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForSpeechSeq2Seq -from optimum.utils.import_utils import is_transformers_version os.environ["TOKENIZERS_PARALLELISM"] = "false" -@pytest.mark.skipif( - is_transformers_version(">", "4.52.4"), - reason="Need to fix in the transformers due to attention refactor https://github.com/huggingface/transformers/pull/38235", -) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 70338e914822ef60373952d159508b53d986cda1 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:09:14 -0700 Subject: [PATCH 31/31] Format --- tests/models/test_modeling_smollm3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_modeling_smollm3.py b/tests/models/test_modeling_smollm3.py index 12eb3c63..1f3e26a2 100644 --- a/tests/models/test_modeling_smollm3.py +++ b/tests/models/test_modeling_smollm3.py @@ -34,7 +34,7 @@ is_linux_ci = sys.platform.startswith("linux") and is_ci -@pytest.mark.skipif(is_linux_ci) +@pytest.mark.skipif(is_linux_ci, reason="Runner OOM") class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -72,7 +72,7 @@ def test_smollm3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): @slow @pytest.mark.run_slow @pytest.mark.portable - @pytest.mark.skipif(is_ci, reason="Too big for CI runners") + @pytest.mark.skipif(is_ci, reason="Runner OOM") def 
test_smollm3_text_generation_portable(self): model_id = "HuggingFaceTB/SmolLM3-3B" prompt = "Give me a brief explanation of gravity in simple terms."
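Aside, not part of the patch series: the final patch also highlights a pytest detail behind the one-line fix, since @pytest.mark.skipif(is_linux_ci) with a plain boolean condition and no reason= is rejected by pytest ("you need to specify reason=STRING when using booleans as conditions"). A self-contained sketch of the corrected marker; how is_ci is derived from the environment is an assumption, as the diffs do not show it:

    import os
    import sys

    import pytest

    # Assumption: the test modules derive is_ci from a CI environment flag;
    # the exact variable name is not visible in the diffs above.
    is_ci = os.environ.get("CI", "").lower() == "true"
    is_linux_ci = sys.platform.startswith("linux") and is_ci


    # reason= is required whenever the skipif condition is a boolean value.
    @pytest.mark.skipif(is_linux_ci, reason="Runner OOM")
    def test_smollm3_smoke():
        assert True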