diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index af830a02..a72dba55 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -34,9 +34,10 @@ jobs: fail-fast: false matrix: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} - executorch-version: ['0.6.0', 'nightly'] + executorch-version: ['0.7.0', 'nightly'] python-version: ['3.11'] - os: [macos-15, ubuntu-22.04] + # os: [macos-15, ubuntu-22.04] # TODO(#122): Re-enable the mac tests after fixing seg fault. + os: [ubuntu-22.04] # Custom job name, now shortened and cleaner name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) diff --git a/install_dev.py b/install_dev.py index 5cf313ff..3fac4546 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,21 +5,21 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250625" - TORCHAO_NIGHTLY_VERSION = "dev20250620" + EXECUTORCH_NIGHTLY_VERSION = "dev20250730" + TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 - TORCH_NIGHTLY_VERSION = "dev20250601" + TORCH_NIGHTLY_VERSION = "dev20250725" subprocess.check_call( [ sys.executable, "-m", "pip", "install", - f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}", - f"torch==2.8.0.{TORCH_NIGHTLY_VERSION}", - f"torchvision==0.23.0.{TORCH_NIGHTLY_VERSION}", + f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}", + f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", + f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", - f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}", + f"torchao==0.13.0.{TORCHAO_NIGHTLY_VERSION}", "--extra-index-url", "https://download.pytorch.org/whl/nightly/cpu", ] @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.53.1 + "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index b5416f87..a9d912ab 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for _ in range(config.num_hidden_layers): + for layer in self.layers: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,32 +202,29 @@ def __init__( layer_device_map=layer_device_map, ) - # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers + # Create a list of cache instances, one per layer. 
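+        # Each layer entry in self.layers carries its own geometry (max_batch_size, max_cache_len, num_heads, head_dim), so no cache-wide attributes are needed here.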
+ # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. self.kv_cache = torch.nn.ModuleList() - for layer_idx in range(config.num_hidden_layers): - # newer version of transfomer has is_sliding defined - # for HybridCache - if self.is_sliding[layer_idx]: + for layer in self.layers: + if layer.is_sliding: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.sliding_window_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.is_sliding[layer_idx]: + if self.layers[layer_idx].is_sliding: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache. + Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. This modifies the model in place. 
Args: @@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Dont know why we need to this even though @@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.is_sliding[i]: + if module.cache.layers[i].is_sliding: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 28ffc2fd..58218e51 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -16,6 +16,7 @@ import logging import os +import shutil from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -24,6 +25,7 @@ import torch from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa from transformers import ( AutoModelForCausalLM, AutoModelForImageClassification, @@ -102,6 +104,34 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon self.stats = Stats() + # Initialize cleanup tracking + self._temp_dir = None + + def __del__(self): + """Clean up temporary files when the model instance is destroyed.""" + 
self._cleanup_temp_resources() + + def _cleanup_temp_resources(self): + """Clean up temporary directory and files.""" + if hasattr(self, "_temp_dir") and self._temp_dir is not None: + try: + if hasattr(self._temp_dir, "cleanup"): + # It's a TemporaryDirectory object + logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}") + self._temp_dir.cleanup() + logging.info("Temporary directory cleanup completed") + elif isinstance(self._temp_dir, (str, Path)): + # It's a path + logging.info(f"Cleaning up temporary path: {self._temp_dir}") + shutil.rmtree(self._temp_dir, ignore_errors=True) + logging.info("Temporary path cleanup completed") + except Exception as e: + # Log cleanup errors for debugging + logging.warning(f"Error during temp directory cleanup: {e}") + pass + finally: + self._temp_dir = None + @abstractmethod def forward(self, *args, **kwargs): """ @@ -242,7 +272,7 @@ def _export( inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class) logging.info(f"Inferred task from model class: {inferred_task}") - save_dir = TemporaryDirectory() + save_dir = TemporaryDirectory(prefix="executorch_export_") save_dir_path = Path(save_dir.name) # Export to ExecuTorch and save the pte file to the temporary directory @@ -266,7 +296,7 @@ def _export( for name, _ in executorch_progs.items(): models.update(cls._from_pretrained(save_dir_path, file_name=f"{name}.pte", config=config)) - return models + return models, save_dir def _save_pretrained(self, save_directory): """ @@ -298,6 +328,7 @@ def from_pretrained( logger.info("Offline mode: setting `local_files_only=True`") local_files_only = True + # See if model was already exported to ExecuTorch and uplaoded to the HuggingFace repo. _export = export try: if local_files_only and not os.path.isdir(model_id): @@ -324,21 +355,21 @@ def from_pretrained( if export: logger.warning( f"The model {model_id} was already converted to the ExecuTorch IR but got `export=True`, the model will be converted to ExecuTorch once again. " - # "Don't forget to save the resulting model with `.save_pretrained()`" ) _export = True else: logger.warning( f"No ExecuTorch files were found for {model_id}, setting `export=True` to convert the model to the ExecuTorch IR. " - # "Don't forget to save the resulting model with `.save_pretrained()`" ) except Exception as exception: logger.warning( f"Could not infer whether the model was already converted or not to the ExecuTorch IR, keeping `export={export}`.\n{exception}" ) + temp_dir = None if _export: - models_dict = cls._export( + logging.info(f"Exporting {model_id} to ExecuTorch program...") + models_dict, temp_dir = cls._export( model_id=model_id, config=config, revision=revision, @@ -351,6 +382,9 @@ def from_pretrained( **kwargs, ) else: + logging.info( + f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export." 
+ ) models_dict = {} for pte_file in pte_files: models_dict.update( @@ -368,7 +402,14 @@ def from_pretrained( ) ) - return cls(models_dict, config) + model_instance = cls(models_dict, config) + + # Store the TemporaryDirectory reference to prevent GC + if temp_dir is not None: + model_instance._temp_dir = temp_dir + logging.info(f"Stored temp directory reference in model: {temp_dir.name}") + + return model_instance class ExecuTorchModelForSeq2SeqLM(ExecuTorchModelBase): diff --git a/optimum/exporters/executorch/__main__.py b/optimum/exporters/executorch/__main__.py index 7a9fe9c8..854bf260 100644 --- a/optimum/exporters/executorch/__main__.py +++ b/optimum/exporters/executorch/__main__.py @@ -15,6 +15,7 @@ """Entry point to the optimum.exporters.executorch command line.""" import argparse +import logging import os import warnings from pathlib import Path @@ -130,10 +131,14 @@ def main_export( kwargs["force_download"] = force_download kwargs["config"] = config + # 1. Load model, apply source transformations, and torch.export() into a graph (ExportedProgram). + logging.info(f"Loading {model_name_or_path} and exporting to static graph...") recipe_kwargs = kwargs.pop("recipe_kwargs", {}) model = task_func(model_name_or_path, **kwargs) + # 2. Export to ExecuTorch through ExecuTorch's lowering APIs. + logging.info(f"Lowering {model_name_or_path} to ExecuTorch...") if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git a/optimum/exporters/executorch/convert.py b/optimum/exporters/executorch/convert.py index 612b07fb..8a381423 100644 --- a/optimum/exporters/executorch/convert.py +++ b/optimum/exporters/executorch/convert.py @@ -19,20 +19,17 @@ from pathlib import Path from typing import Union +from transformers.integrations.executorch import sdpa_mask_without_vmap +from transformers.masking_utils import AttentionMaskInterface from transformers.modeling_utils import AttentionInterface from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward -from optimum.utils.import_utils import is_transformers_version from .recipe_registry import discover_recipes, recipe_registry AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward) -if is_transformers_version(">=", "4.53.0.dev0"): - from transformers.integrations.executorch import sdpa_mask_without_vmap - from transformers.masking_utils import AttentionMaskInterface - - AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap) +AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap) def export_to_executorch( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index cc2992b9..d6695902 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -29,7 +29,6 @@ from transformers.generation.configuration_utils import GenerationConfig from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache -from optimum.utils.import_utils import is_transformers_version from .utils import save_config_to_constant_methods @@ -72,11 +71,7 @@ def _prepare_export_inputs(self): and not (self.use_custom_kv_cache and self.use_custom_sdpa) ) - if ( - not self.disable_dynamic_shapes - and is_transformers_version(">", "4.52.0") - and not is_using_hybrid_cache_wo_custom_sdpa_kv_cache - ): + if not self.disable_dynamic_shapes and not is_using_hybrid_cache_wo_custom_sdpa_kv_cache: # Prepare inputs with dynamic shapes seq_length = 3 # Sequence length > 1 to avoid 
specialization issues example_input_ids = torch.zeros((1, seq_length), dtype=torch.long) @@ -93,24 +88,23 @@ def _prepare_export_inputs(self): return example_input_ids, example_cache_position, dynamic_shapes, strict - def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module): - if is_transformers_version(">=", "4.53.0.dev0"): - from transformers.integrations.executorch import sdpa_mask_without_vmap - from transformers.masking_utils import AttentionMaskInterface - from transformers.modeling_utils import AttentionInterface - - _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module) - if self.use_custom_sdpa: - if self.use_custom_kv_cache: - AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache) - AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap) - # Manually set the attention implementation to custom_sdpa_ring_kv_cache - # This handles both regular sdpa and one for sliding window/local attention - exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache" - else: - # Manually set the attention implementation to custom_sdpa_ring_kv_cache - # This handles both regular sdpa and one for sliding window/local attention - exportable_module.model.model.config._attn_implementation = "custom_sdpa" + def _register_custom_attention(self, exportable_module: torch.nn.Module): + from transformers.integrations.executorch import sdpa_mask_without_vmap + from transformers.masking_utils import AttentionMaskInterface + from transformers.modeling_utils import AttentionInterface + + _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module) + if self.use_custom_sdpa: + if self.use_custom_kv_cache: + AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache) + AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap) + # Manually set the attention implementation to custom_sdpa_ring_kv_cache + # This handles both regular sdpa and one for sliding window/local attention + exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache" + else: + # Manually set the attention implementation to custom_sdpa_ring_kv_cache + # This handles both regular sdpa and one for sliding window/local attention + exportable_module.model.model.config._attn_implementation = "custom_sdpa" def export( self, @@ -119,55 +113,48 @@ def export( logging.info( f"Exporting using input_ids({input_ids.shape})={input_ids}, cache_position({cache_position.shape})={cache_position}, dynamic_shapes={dynamic_shapes}, strict={strict}" ) - if is_transformers_version(">", "4.52.0"): - from transformers.integrations.executorch import ( - TorchExportableModuleForDecoderOnlyLM, - ) - exportable_module = TorchExportableModuleForDecoderOnlyLM( - self.model, - max_batch_size=1, - max_cache_len=self.metadata.get("get_max_seq_len"), - ) - self._register_attention_mask_for_4_53(exportable_module) + from transformers.integrations.executorch import ( + TorchExportableModuleForDecoderOnlyLM, + ) - if self.use_custom_kv_cache: - from optimum.executorch.attentions.custom_kv_cache import ( - replace_with_et_custom_kv_cache, - ) + exportable_module = TorchExportableModuleForDecoderOnlyLM( + self.model, + max_batch_size=1, + max_cache_len=self.metadata.get("get_max_seq_len"), + ) + self._register_custom_attention(exportable_module) - replace_with_et_custom_kv_cache( - exportable_module.model, - self.model.config, - 
self.model.generation_config, - self.model.dtype, - ) + if self.use_custom_kv_cache: + from optimum.executorch.attentions.custom_kv_cache import ( + replace_with_et_custom_kv_cache, + ) - with torch.no_grad(): - exported_program = exportable_module.export(input_ids, cache_position, dynamic_shapes, strict) - # Apply RemoveTransposes pass to remove - # any back-to-back transpose ops that are not needed - # e.g. output of update_cache is transposed and - # input to custom_sdpa is transposed. - from executorch.extension.llm.export.export_passes import ( - RemoveRedundantTransposes, - ) + replace_with_et_custom_kv_cache( + exportable_module.model, + self.model.config, + self.model.generation_config, + self.model.dtype, + ) - mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0] - exported_program = torch.export.export( - mutated_gm, - args=(input_ids, cache_position), - kwargs={}, - dynamic_shapes=dynamic_shapes, - strict=strict, - ) - else: - # Path to use legacy API, static export only due to pinned transformers version - from transformers.integrations.executorch import ( - convert_and_export_with_cache, + with torch.no_grad(): + exported_program = exportable_module.export(input_ids, cache_position, dynamic_shapes, strict) + # Apply RemoveTransposes pass to remove + # any back-to-back transpose ops that are not needed + # e.g. output of update_cache is transposed and + # input to custom_sdpa is transposed. + from executorch.extension.llm.export.export_passes import ( + RemoveRedundantTransposes, ) - exported_program = convert_and_export_with_cache(self.model, input_ids, cache_position) + mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0] + exported_program = torch.export.export( + mutated_gm, + args=(input_ids, cache_position), + kwargs={}, + dynamic_shapes=dynamic_shapes, + strict=strict, + ) return {"model": exported_program} @@ -400,8 +387,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.max_cache_len, - batch_size=self.generation_config.cache_config.batch_size, + max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), + batch_size=self.generation_config.cache_config.get("batch_size"), ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py index c2e2028f..8994fd41 100644 --- a/optimum/exporters/executorch/quantization.py +++ b/optimum/exporters/executorch/quantization.py @@ -16,8 +16,6 @@ from typing import Optional import torch -import torchao -from packaging.version import parse def quantize_model_( @@ -26,10 +24,6 @@ def quantize_model_( if not (qlinear_config or qembedding_config): return - # TODO: Update torchao to use 0.11.0 once released - if parse(torchao.__version__) < parse("0.11.0.dev0"): - raise RuntimeError("Quantization requires torchao >= 0.11.0. 
Please upgrade torchao.") - from torchao.quantization.granularity import PerAxis, PerGroup from torchao.quantization.quant_api import ( Int8DynamicActivationIntxWeightConfig, diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index 70447957..e805ad4e 100644 --- a/optimum/exporters/executorch/utils.py +++ b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = getattr(cache_config, "batch_size", None) - max_seq_len = getattr(cache_config, "max_cache_len", None) + max_batch_size = cache_config.get("batch_size") + max_seq_len = cache_config.get("max_cache_len") if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index c7fa93ed..7d447fa6 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.51.3", + "transformers==4.54.1", ] TESTS_REQUIRE = [ diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py index 56666c04..11e91174 100644 --- a/tests/models/test_modeling_gemma3.py +++ b/tests/models/test_modeling_gemma3.py @@ -22,15 +22,11 @@ import unittest import pytest -import torchao -import transformers from executorch.extension.pybindings.portable_lib import ExecuTorchModule -from packaging.version import parse from transformers import AutoTokenizer from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForCausalLM -from optimum.utils.import_utils import is_transformers_version from ..utils import check_causal_lm_output_quality @@ -41,10 +37,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" -@pytest.mark.skipif( - is_transformers_version("<", "4.52.0.dev0"), - reason="Only available on transformers >= 4.52.0.dev0", -) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -119,10 +111,6 @@ def test_gemma3_text_generation_portable(self): @slow @pytest.mark.run_slow @pytest.mark.skipif(is_linux_ci, reason="OOM on linux runner") - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" @@ -191,10 +179,6 @@ def test_gemma3_text_generation_with_custom_sdpa_float16(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" @@ -230,10 +214,6 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def 
test_gemma3_text_generation_with_custom_sdpa_kv_cache_8da4w_8we(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" diff --git a/tests/models/test_modeling_phi4.py b/tests/models/test_modeling_phi4.py index b0851444..fa1c9be6 100644 --- a/tests/models/test_modeling_phi4.py +++ b/tests/models/test_modeling_phi4.py @@ -21,7 +21,6 @@ import pytest import torchao -import transformers from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse from transformers import AutoConfig, AutoTokenizer @@ -43,12 +42,6 @@ def __init__(self, *args, **kwargs): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - is_linux_ci - or parse(transformers.__version__) < parse("4.52.0") - or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0. OOM on linux runner.", - ) def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "microsoft/Phi-4-mini-instruct" model = ExecuTorchModelForCausalLM.from_pretrained( diff --git a/tests/models/test_modeling_qwen3.py b/tests/models/test_modeling_qwen3.py index 0f0d8a36..4deacac8 100644 --- a/tests/models/test_modeling_qwen3.py +++ b/tests/models/test_modeling_qwen3.py @@ -23,7 +23,6 @@ import pytest import torchao -import transformers from executorch import version from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse @@ -214,10 +213,6 @@ def test_qwen3_text_generation_with_custom_sdpa_8da4w_8we(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0.dev0"), - reason="Only available on transformers >= 4.52.0.dev0", - ) def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache(self): model_id = "Qwen/Qwen3-0.6B" prompt = "Give me a short introduction to large language model." @@ -249,10 +244,6 @@ def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0", - ) def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "Qwen/Qwen3-0.6B" prompt = "Give me a short introduction to large language model." 
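
For context, the tests above all exercise the same export-and-generate flow against the new transformers 4.54.1 / torchao nightly pins. The sketch below is illustrative only: the model id and prompt are copied from the qwen3 tests in this diff, but the `from_pretrained` keyword arguments (`recipe`, `attn_implementation`, `use_custom_kv_cache`) and the `text_generation` call are assumptions about the optimum-executorch API, not lines taken verbatim from the test bodies.

# Hedged sketch of the flow the tests above exercise; not part of this patch.
from transformers import AutoTokenizer
from optimum.executorch import ExecuTorchModelForCausalLM

model_id = "Qwen/Qwen3-0.6B"  # appears in the qwen3 tests above
model = ExecuTorchModelForCausalLM.from_pretrained(
    model_id,
    recipe="xnnpack",                   # assumed recipe name
    attn_implementation="custom_sdpa",  # assumed kwarg, mirrors the custom SDPA path
    use_custom_kv_cache=True,           # assumed kwarg, mirrors the custom KV-cache path
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
generated_text = model.text_generation(
    tokenizer=tokenizer,
    prompt="Give me a short introduction to large language model.",
    max_seq_len=64,
)
print(generated_text)
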
diff --git a/tests/models/test_modeling_qwen3_embedding.py b/tests/models/test_modeling_qwen3_embedding.py index 73002e9c..0146634f 100644 --- a/tests/models/test_modeling_qwen3_embedding.py +++ b/tests/models/test_modeling_qwen3_embedding.py @@ -19,8 +19,6 @@ import unittest import pytest -import torchao -import transformers from executorch import version from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse @@ -41,10 +39,6 @@ def __init__(self, *args, **kwargs): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0", - ) def test_qwen3_embedding_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "Qwen/Qwen3-Embedding-0.6B" prompt = "Explain gravity" diff --git a/tests/models/test_modeling_smollm3.py b/tests/models/test_modeling_smollm3.py index 34a63510..1f3e26a2 100644 --- a/tests/models/test_modeling_smollm3.py +++ b/tests/models/test_modeling_smollm3.py @@ -25,7 +25,6 @@ from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForCausalLM -from optimum.utils.import_utils import is_transformers_version from ..utils import check_causal_lm_output_quality @@ -35,10 +34,7 @@ is_linux_ci = sys.platform.startswith("linux") and is_ci -@pytest.mark.skipif( - is_transformers_version("<", "4.53.1") or is_linux_ci, - reason="Only available on transformers >= 4.53.1", -) +@pytest.mark.skipif(is_linux_ci, reason="Runner OOM") class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -76,7 +72,7 @@ def test_smollm3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): @slow @pytest.mark.run_slow @pytest.mark.portable - @pytest.mark.skipif(is_ci, reason="Too big for CI runners") + @pytest.mark.skipif(is_ci, reason="Runner OOM") def test_smollm3_text_generation_portable(self): model_id = "HuggingFaceTB/SmolLM3-3B" prompt = "Give me a brief explanation of gravity in simple terms." diff --git a/tests/models/test_modeling_whisper.py b/tests/models/test_modeling_whisper.py index 3ecfc1c7..e88cb80f 100644 --- a/tests/models/test_modeling_whisper.py +++ b/tests/models/test_modeling_whisper.py @@ -28,16 +28,11 @@ from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForSpeechSeq2Seq -from optimum.utils.import_utils import is_transformers_version os.environ["TOKENIZERS_PARALLELISM"] = "false" -@pytest.mark.skipif( - is_transformers_version(">", "4.52.4"), - reason="Need to fix in the transformers due to attention refactor https://github.com/huggingface/transformers/pull/38235", -) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs)
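
A recurring change in this diff is that `generation_config.cache_config` is now read with dict-style `.get(...)` instead of attribute access (see custom_kv_cache.py, integrations.py, and utils.py above). The helper below is a minimal, hypothetical sketch of how code that must run against both the old and new transformers pins could read those entries defensively; it is not part of this patch.

# Hypothetical helper, not part of this patch: read a cache_config entry whether
# transformers exposes cache_config as a dict-like object (as assumed by the
# .get("batch_size") / .get("max_cache_len") calls above) or as an object with
# attributes (the pre-4.54 behavior).
def read_cache_config(cache_config, key, default=None):
    if cache_config is None:
        return default
    if hasattr(cache_config, "get"):
        return cache_config.get(key, default)       # dict-style access, matches this patch
    return getattr(cache_config, key, default)      # attribute-style access, older transformers

# Example: read_cache_config(generation_config.cache_config, "max_cache_len")
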