From 3a960bb88d2ba76205a27ba975019f1256ae48f0 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Thu, 31 Jul 2025 17:46:41 -0700
Subject: [PATCH 01/31] Bump transformers to 4.54.1

---
 install_dev.py                                | 2 +-
 .../executorch/attentions/custom_kv_cache.py  | 67 +++++++++----------
 optimum/exporters/executorch/integrations.py  | 4 +-
 optimum/exporters/executorch/utils.py         | 4 +-
 setup.py                                      | 2 +-
 5 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/install_dev.py b/install_dev.py
index 5cf313ff..14012554 100644
--- a/install_dev.py
+++ b/install_dev.py
@@ -34,7 +34,7 @@ def install_dep_from_source():
             "-m",
             "pip",
             "install",
-            "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers",  # 4.53.1
+            "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers",  # 4.54.1
         ]
     )
     subprocess.check_call(

diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py
index b5416f87..ab7e485e 100644
--- a/optimum/executorch/attentions/custom_kv_cache.py
+++ b/optimum/executorch/attentions/custom_kv_cache.py
@@ -54,12 +54,12 @@ def __init__(
         # Create a list of CustomKVCache instances, one per layer
         self.kv_cache = torch.nn.ModuleList()
-        for _ in range(config.num_hidden_layers):
+        for layer in self.layers:
             layer_cache = CustomKVCache(
-                max_batch_size=self.max_batch_size,
-                max_context_length=self.max_cache_len,
-                n_heads=self.num_key_value_heads,
-                head_dim=self.head_dim,
+                max_batch_size=layer.max_batch_size,
+                max_context_length=layer.max_cache_len,
+                n_heads=layer.num_heads,
+                head_dim=layer.head_dim,
                 dtype=dtype,
             )
             self.kv_cache.append(layer_cache)
@@ -202,32 +202,29 @@ def __init__(
             layer_device_map=layer_device_map,
         )
-        # make sure layer_device_map is none
         assert layer_device_map is None
         assert device is None or device == "cpu", "Device must be None or 'cpu'"
         self.cache_position = None
-        # Create a list of cache instances, one per layer
-        # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers
+        # Create a list of cache instances, one per layer.
+        # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers.
         self.kv_cache = torch.nn.ModuleList()
-        for layer_idx in range(config.num_hidden_layers):
-            # newer version of transfomer has is_sliding defined
-            # for HybridCache
-            if self.is_sliding[layer_idx]:
+        for layer in self.layers:
+            if layer.is_sliding():
                 # This is a sliding window layer
                 layer_cache = CustomRingKVCache(
-                    max_batch_size=self.max_batch_size,
-                    max_context_length=self.sliding_window_len,
-                    n_heads=self.num_key_value_heads,
-                    head_dim=self.head_dim,
+                    max_batch_size=layer.max_batch_size,
+                    max_context_length=layer.max_cache_len,
+                    n_heads=layer.num_heads,
+                    head_dim=layer.head_dim,
                     dtype=dtype,
                 )
             else:
                 layer_cache = CustomKVCache(
-                    max_batch_size=self.max_batch_size,
-                    max_context_length=self.max_cache_len,
-                    n_heads=self.num_key_value_heads,
-                    head_dim=self.head_dim,
+                    max_batch_size=layer.max_batch_size,
+                    max_context_length=layer.max_cache_len,
+                    n_heads=layer.num_heads,
+                    head_dim=layer.head_dim,
                     dtype=dtype,
                 )
             self.kv_cache.append(layer_cache)
@@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
         # For CustomRingKVCache, we need to handle the sequence length differently
         layer_cache = self.kv_cache[layer_idx]
-        if self.is_sliding[layer_idx]:
+        if self.layers[layer_idx].is_sliding():
             # CustomRingKVCache cache_position_manager which
             # maintains cache position for each slot in the kv cache
             # we return the max position + 1 to indicate max position
@@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int):
 def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype):
     """
-    Replace all KV caches in the module with ETCustomStaticCache.
+    Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache.
     This modifies the model in place.
     Args:
@@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
         if getattr(module, "replace_cache", None) is not None:
             static_cache = ETCustomStaticCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
             module.replace_cache(static_cache)
         else:
             module.static_cache = ETCustomStaticCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
             # Dont know why we need to this even though
@@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
         if getattr(module, "replace_cache", None) is not None:
             hybrid_cache = ETCustomHybridCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
             module.replace_cache(hybrid_cache)
         else:
             module.cache = ETCustomHybridCache(
                 config=config,
-                max_batch_size=generation_config.cache_config.batch_size,
-                max_cache_len=generation_config.cache_config.max_cache_len,
-                device=generation_config.cache_config.device,
+                max_batch_size=generation_config.cache_config.get("batch_size"),
+                max_cache_len=generation_config.cache_config.get("max_cache_len"),
+                device=generation_config.cache_config.get("device"),
                 dtype=cache_dtype,
             )
         # Register cache attributes for each layer
         for i in range(len(module.cache.kv_cache)):
             setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
             setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
-            if module.cache.is_sliding[i]:
+            if module.cache.layers[i].is_sliding():
                 # Register cache_positions as buffer for sliding window layers
                 # This prevents it from being traced as a constant
                 module.register_buffer(

diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py
index 23e6819a..06522c27 100644
--- a/optimum/exporters/executorch/integrations.py
+++ b/optimum/exporters/executorch/integrations.py
@@ -395,8 +395,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi
         wrapped_decoder = (
             Seq2SeqLMDecoderExportableModuleWithStaticCache(
                 model=self.full_model,
-                max_static_cache_length=self.generation_config.cache_config.max_cache_len,
-                batch_size=self.generation_config.cache_config.batch_size,
+                max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"),
+                batch_size=self.generation_config.cache_config.get("batch_size"),
             )
             .to("cpu")
             .eval()

diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py
index 70447957..e805ad4e 100644
--- a/optimum/exporters/executorch/utils.py
+++ b/optimum/exporters/executorch/utils.py
@@ -53,8 +53,8 @@ def save_config_to_constant_methods(
     # Check for cache_config and its attributes
     cache_config = getattr(generation_config, "cache_config", None)
     if cache_config is not None:
-        max_batch_size = getattr(cache_config, "batch_size", None)
-        max_seq_len = getattr(cache_config, "max_cache_len", None)
+        max_batch_size = cache_config.get("batch_size")
+        max_seq_len = cache_config.get("max_cache_len")
         if max_batch_size is not None:
             metadata["get_max_batch_size"] = max_batch_size

diff --git a/setup.py b/setup.py
index c7fa93ed..7d447fa6 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 INSTALL_REQUIRE = [
     "optimum~=1.24",
     "executorch>=0.6.0",
-    "transformers==4.51.3",
+    "transformers==4.54.1",
 ]
 TESTS_REQUIRE = [

From 3d223a2cc232ca367f2a42039837261d6c3c5c12 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Thu, 31 Jul 2025 18:08:19 -0700
Subject: [PATCH 02/31] Bump torch

---
 install_dev.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/install_dev.py b/install_dev.py
index 14012554..87564c3d 100644
--- a/install_dev.py
+++ b/install_dev.py
@@ -6,9 +6,9 @@ def install_torch_nightly_deps():
     """Install torch related dependencies from pinned nightly"""
     EXECUTORCH_NIGHTLY_VERSION = "dev20250625"
-    TORCHAO_NIGHTLY_VERSION = "dev20250620"
+    TORCHAO_NIGHTLY_VERSION = "dev20250710"
     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
-    TORCH_NIGHTLY_VERSION = "dev20250601"
+    TORCH_NIGHTLY_VERSION = "dev20250716"
     subprocess.check_call(
         [
             sys.executable,
             "-m",
             "pip",
             "install",
             f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}",
-            f"torch==2.8.0.{TORCH_NIGHTLY_VERSION}",
-            f"torchvision==0.23.0.{TORCH_NIGHTLY_VERSION}",
+            f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}",
+            f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}",
             f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}",
             f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}",
             "--extra-index-url",
             "https://download.pytorch.org/whl/nightly/cpu",
         ]
     )

From 207f8b1c787efb878aae00973320503437b459f5 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 4 Aug 2025 12:00:29 -0700
Subject: [PATCH 03/31] Fix no module found error for custom_kv_cache

---
 optimum/exporters/executorch/integrations.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py
index 06522c27..35406190 100644
--- a/optimum/exporters/executorch/integrations.py
+++ b/optimum/exporters/executorch/integrations.py
@@ -28,6 +28,7 @@
 )
 from transformers.generation.configuration_utils import GenerationConfig
+from executorch import version as executorch_version
 from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache
 from optimum.utils.import_utils import is_transformers_version
@@ -89,7 +90,10 @@ def _prepare_export_inputs(self):
         return example_input_ids, example_cache_position, dynamic_shapes, strict
     def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module):
-        if is_transformers_version(">=", "4.53.0.dev0"):
+        if (
+            is_transformers_version(">=", "4.53.0.dev0")
+            and parse(executorch_version.__version__).base_version > "0.6.0"
+        ):
             from transformers.integrations.executorch import sdpa_mask_without_vmap
             from transformers.masking_utils import AttentionMaskInterface
             from transformers.modeling_utils import AttentionInterface
@@ -126,7 +130,7 @@ def export(
         )
         self._register_attention_mask_for_4_53(exportable_module)
-        if self.use_custom_kv_cache:
+        if self.use_custom_kv_cache and parse(executorch_version.__version__).base_version > "0.6.0":
             from optimum.executorch.attentions.custom_kv_cache import (
                 replace_with_et_custom_kv_cache,
             )

From bc828412787ced35d92f97c5fb402905d04d8e76 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 4 Aug 2025 12:07:11 -0700
Subject: [PATCH 04/31] Try to fix Missing operator: [8] quantized_decomposed::embedding_byte.out

---
 optimum/executorch/modeling.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py
index 28ffc2fd..710b3403 100644
--- a/optimum/executorch/modeling.py
+++ b/optimum/executorch/modeling.py
@@ -24,6 +24,7 @@
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa
 from transformers import (
     AutoModelForCausalLM,
     AutoModelForImageClassification,

From 35fc91863f732546b2fbb669c80c7fa30d4bb29e Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 4 Aug 2025 13:19:37 -0700
Subject: [PATCH 05/31] Fix quantization requires torchao >= 0.11.0

---
 optimum/exporters/executorch/quantization.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py
index c2e2028f..299a2ddc 100644
--- a/optimum/exporters/executorch/quantization.py
+++ b/optimum/exporters/executorch/quantization.py
@@ -26,10 +26,6 @@ def quantize_model_(
     if not (qlinear_config or qembedding_config):
return - # TODO: Update torchao to use 0.11.0 once released - if parse(torchao.__version__) < parse("0.11.0.dev0"): - raise RuntimeError("Quantization requires torchao >= 0.11.0. Please upgrade torchao.") - from torchao.quantization.granularity import PerAxis, PerGroup from torchao.quantization.quant_api import ( Int8DynamicActivationIntxWeightConfig, From 6a26464ae6448c690d0c9b20350027b21b431728 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 4 Aug 2025 17:09:51 -0700 Subject: [PATCH 06/31] Fix sliding window, print loaded ops --- optimum/executorch/attentions/custom_kv_cache.py | 6 +++--- optimum/executorch/modeling.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index ab7e485e..a9d912ab 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -210,7 +210,7 @@ def __init__( # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. self.kv_cache = torch.nn.ModuleList() for layer in self.layers: - if layer.is_sliding(): + if layer.is_sliding: # This is a sliding window layer layer_cache = CustomRingKVCache( max_batch_size=layer.max_batch_size, @@ -281,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.layers[layer_idx].is_sliding(): + if self.layers[layer_idx].is_sliding: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -385,7 +385,7 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.layers[i].is_sliding(): + if module.cache.layers[i].is_sliding: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 710b3403..03cb7593 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -186,6 +186,13 @@ def _from_pretrained( subfolder=subfolder, local_files_only=local_files_only, ) + + from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa + from executorch.kernels import quantized # noqa + from executorch.extension.pybindings.portable_lib import _get_operator_names + print("----------- LOADED OPS ----------") + print('\n'.join(_get_operator_names())) + print("---------------------------------") model = _load_for_executorch(model_cache_path) logging.info( f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)" From 4d68263c367468c1b2db47c3cbc53731027efa35 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 4 Aug 2025 18:33:23 -0700 Subject: [PATCH 07/31] Bump ET nightly pin, fixes missing quantized ops --- install_dev.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_dev.py b/install_dev.py index 87564c3d..096e3774 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def 
install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250625" + EXECUTORCH_NIGHTLY_VERSION = "dev20250730" TORCHAO_NIGHTLY_VERSION = "dev20250710" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250716" @@ -15,7 +15,7 @@ def install_torch_nightly_deps(): "-m", "pip", "install", - f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}", + f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}", f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", From 6a3e1d4a074553443069fd7d8b28ad1324d9671b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 4 Aug 2025 20:55:09 -0700 Subject: [PATCH 08/31] Fix no Q_ANNOTATION_KEY --- install_dev.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install_dev.py b/install_dev.py index 096e3774..3fac4546 100644 --- a/install_dev.py +++ b/install_dev.py @@ -6,9 +6,9 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" EXECUTORCH_NIGHTLY_VERSION = "dev20250730" - TORCHAO_NIGHTLY_VERSION = "dev20250710" + TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 - TORCH_NIGHTLY_VERSION = "dev20250716" + TORCH_NIGHTLY_VERSION = "dev20250725" subprocess.check_call( [ sys.executable, @@ -19,7 +19,7 @@ def install_torch_nightly_deps(): f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", - f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}", + f"torchao==0.13.0.{TORCHAO_NIGHTLY_VERSION}", "--extra-index-url", "https://download.pytorch.org/whl/nightly/cpu", ] From 2b5fe7ec0993742d4714cf06ffa2822496514e8b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 8 Aug 2025 14:32:55 -0700 Subject: [PATCH 09/31] Try to fix segfault/bus error by holding onto temp dir --- optimum/executorch/modeling.py | 73 ++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 03cb7593..7b1c4c62 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -16,6 +16,8 @@ import logging import os +import tempfile +import shutil from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -102,6 +104,34 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon setattr(self, key, value) self.stats = Stats() + + # Initialize cleanup tracking + self._temp_dir = None + + def __del__(self): + """Clean up temporary files when the model instance is destroyed.""" + self._cleanup_temp_resources() + + def _cleanup_temp_resources(self): + """Clean up temporary directory and files.""" + if hasattr(self, '_temp_dir') and self._temp_dir is not None: + try: + if hasattr(self._temp_dir, 'cleanup'): + # It's a TemporaryDirectory object + logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}") + self._temp_dir.cleanup() + logging.info(f"Temporary directory cleanup completed") + elif isinstance(self._temp_dir, (str, Path)): + # It's a path + logging.info(f"Cleaning up temporary path: {self._temp_dir}") + 
shutil.rmtree(self._temp_dir, ignore_errors=True) + logging.info(f"Temporary path cleanup completed") + except Exception as e: + # Log cleanup errors for debugging + logging.warning(f"Error during temp directory cleanup: {e}") + pass + finally: + self._temp_dir = None @abstractmethod def forward(self, *args, **kwargs): @@ -250,7 +280,7 @@ def _export( inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class) logging.info(f"Inferred task from model class: {inferred_task}") - save_dir = TemporaryDirectory() + save_dir = TemporaryDirectory(prefix="executorch_export_") save_dir_path = Path(save_dir.name) # Export to ExecuTorch and save the pte file to the temporary directory @@ -274,7 +304,16 @@ def _export( for name, _ in executorch_progs.items(): models.update(cls._from_pretrained(save_dir_path, file_name=f"{name}.pte", config=config)) - return models + # Log temp directory info for debugging + logging.info(f"Created temporary directory: {save_dir_path}") + for name in executorch_progs.keys(): + pte_file = save_dir_path / f"{name}.pte" + if pte_file.exists(): + logging.info(f"PTE file exists at export: {pte_file} (size: {pte_file.stat().st_size} bytes)") + else: + logging.warning(f"PTE file missing at export: {pte_file}") + + return models, save_dir def _save_pretrained(self, save_directory): """ @@ -345,8 +384,9 @@ def from_pretrained( f"Could not infer whether the model was already converted or not to the ExecuTorch IR, keeping `export={export}`.\n{exception}" ) + temp_dir = None if _export: - models_dict = cls._export( + models_dict, temp_dir = cls._export( model_id=model_id, config=config, revision=revision, @@ -376,7 +416,14 @@ def from_pretrained( ) ) - return cls(models_dict, config) + model_instance = cls(models_dict, config) + + # Store the TemporaryDirectory reference to prevent GC + if temp_dir is not None: + model_instance._temp_dir = temp_dir + logging.info(f"Stored temp directory reference in model: {temp_dir.name}") + + return model_instance class ExecuTorchModelForSeq2SeqLM(ExecuTorchModelBase): @@ -647,12 +694,30 @@ def forward( Returns: torch.Tensor: Logits output from the model. 
""" + # Check if temp directory and PTE file still exist before forward pass + if hasattr(self, '_temp_dir') and self._temp_dir is not None: + temp_path = Path(self._temp_dir.name) + logging.info(f"Forward pass - temp directory exists: {temp_path.exists()}") + if temp_path.exists(): + pte_files = list(temp_path.glob("*.pte")) + logging.info(f"Forward pass - PTE files found: {len(pte_files)}") + for pte_file in pte_files: + logging.info(f"Forward pass - PTE file: {pte_file} exists: {pte_file.exists()}, size: {pte_file.stat().st_size if pte_file.exists() else 'N/A'}") + else: + logging.error(f"Forward pass - temp directory missing: {temp_path}") + else: + logging.info("Forward pass - no temp directory reference stored") + self.stats.on_model_execution_start() try: logits = self.model.forward((input_ids, cache_position))[0] except Exception as e: shapes = {name: val.shape for name, val in locals().items() if hasattr(val, "shape")} + logging.error(f"Forward pass failed - temp dir exists: {hasattr(self, '_temp_dir') and self._temp_dir is not None}") + if hasattr(self, '_temp_dir') and self._temp_dir is not None: + temp_path = Path(self._temp_dir.name) + logging.error(f"Forward pass failed - temp directory: {temp_path} exists: {temp_path.exists()}") print(f"Exception: {e}.\n{self.model.method_meta('forward')}\narg shapes: {shapes}") raise From bb0089cd34ab859c1467d0d8b12b702b1b6f9c2c Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 8 Aug 2025 19:03:06 -0700 Subject: [PATCH 10/31] Bigger mac runners --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index af830a02..bf83331e 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -36,7 +36,7 @@ jobs: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} executorch-version: ['0.6.0', 'nightly'] python-version: ['3.11'] - os: [macos-15, ubuntu-22.04] + os: [macos-15-large, ubuntu-22.04] # Custom job name, now shortened and cleaner name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) From 72802e3453d616d05d913f976c8561f95eae9ec0 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 12:11:15 -0700 Subject: [PATCH 11/31] Revert "Bigger mac runners" This reverts commit 27097140cd11a11a886dfb14846893b0c31608a3. 
--- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index bf83331e..af830a02 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -36,7 +36,7 @@ jobs: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} executorch-version: ['0.6.0', 'nightly'] python-version: ['3.11'] - os: [macos-15-large, ubuntu-22.04] + os: [macos-15, ubuntu-22.04] # Custom job name, now shortened and cleaner name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) From 9876c7e850ad816a044223d26dd6e2a40ed27219 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 12:55:58 -0700 Subject: [PATCH 12/31] Add helpful logs --- optimum/executorch/modeling.py | 7 +++++-- optimum/exporters/executorch/__main__.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 7b1c4c62..55bfa477 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -345,6 +345,7 @@ def from_pretrained( logger.info("Offline mode: setting `local_files_only=True`") local_files_only = True + # See if model was already exported to ExecuTorch and uplaoded to the HuggingFace repo. _export = export try: if local_files_only and not os.path.isdir(model_id): @@ -371,13 +372,11 @@ def from_pretrained( if export: logger.warning( f"The model {model_id} was already converted to the ExecuTorch IR but got `export=True`, the model will be converted to ExecuTorch once again. " - # "Don't forget to save the resulting model with `.save_pretrained()`" ) _export = True else: logger.warning( f"No ExecuTorch files were found for {model_id}, setting `export=True` to convert the model to the ExecuTorch IR. " - # "Don't forget to save the resulting model with `.save_pretrained()`" ) except Exception as exception: logger.warning( @@ -386,6 +385,7 @@ def from_pretrained( temp_dir = None if _export: + logging.info(f"Exporting {model_id} to ExecuTorch program...") models_dict, temp_dir = cls._export( model_id=model_id, config=config, @@ -399,6 +399,7 @@ def from_pretrained( **kwargs, ) else: + logging.info(f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export.") models_dict = {} for pte_file in pte_files: models_dict.update( @@ -711,7 +712,9 @@ def forward( self.stats.on_model_execution_start() try: + logging.info("Running forward()...") logits = self.model.forward((input_ids, cache_position))[0] + logging.info(f"logits from forward(): {logits}") except Exception as e: shapes = {name: val.shape for name, val in locals().items() if hasattr(val, "shape")} logging.error(f"Forward pass failed - temp dir exists: {hasattr(self, '_temp_dir') and self._temp_dir is not None}") diff --git a/optimum/exporters/executorch/__main__.py b/optimum/exporters/executorch/__main__.py index 7a9fe9c8..d505af64 100644 --- a/optimum/exporters/executorch/__main__.py +++ b/optimum/exporters/executorch/__main__.py @@ -15,6 +15,7 @@ """Entry point to the optimum.exporters.executorch command line.""" import argparse +import logging import os import warnings from pathlib import Path @@ -130,13 +131,15 @@ def main_export( kwargs["force_download"] = force_download kwargs["config"] = config + # 1. 
Load model, apply source transformations, and torch.export() into a graph (ExportedProgram). + logging.info(f"Loading {model_name_or_path} and exporting to static graph...") recipe_kwargs = kwargs.pop("recipe_kwargs", {}) - model = task_func(model_name_or_path, **kwargs) + # 2. Export to ExecuTorch through ExecuTorch's lowering APIs. + logging.info(f"Lowering {model_name_or_path} to ExecuTorch...") if not os.path.exists(output_dir): os.makedirs(output_dir) - return export_to_executorch( model=model, task=task, From 19f4d21bf780851c7b96db447c377118208b6a0e Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 13:41:44 -0700 Subject: [PATCH 13/31] Re-enable smollm3 tests for linux --- tests/models/test_modeling_smollm3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_modeling_smollm3.py b/tests/models/test_modeling_smollm3.py index 34a63510..10dce8ee 100644 --- a/tests/models/test_modeling_smollm3.py +++ b/tests/models/test_modeling_smollm3.py @@ -36,7 +36,7 @@ @pytest.mark.skipif( - is_transformers_version("<", "4.53.1") or is_linux_ci, + is_transformers_version("<", "4.53.1"), reason="Only available on transformers >= 4.53.1", ) class ExecuTorchModelIntegrationTest(unittest.TestCase): From 99805f82be3461bb9bd269a29042c111f79ef079 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 14:56:10 -0700 Subject: [PATCH 14/31] Experiment reverting transformers bump --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 ++++++++++--------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 41 insertions(+), 38 deletions(-) diff --git a/install_dev.py b/install_dev.py index 3fac4546..486edb3c 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index a9d912ab..b5416f87 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: + for _ in range(config.num_hidden_layers): layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,29 +202,32 @@ def __init__( layer_device_map=layer_device_map, ) + # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer. - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. 
+ # Create a list of cache instances, one per layer + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: - if layer.is_sliding: + for layer_idx in range(config.num_hidden_layers): + # newer version of transfomer has is_sliding defined + # for HybridCache + if self.is_sliding[layer_idx]: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.sliding_window_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -281,7 +284,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.layers[layer_idx].is_sliding: + if self.is_sliding[layer_idx]: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -305,7 +308,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. + Replace all KV caches in the module with ETCustomStaticCache. This modifies the model in place. 
Args: @@ -339,18 +342,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Dont know why we need to this even though @@ -367,25 +370,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.layers[i].is_sliding: + if module.cache.is_sliding[i]: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 35406190..e6447562 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), - batch_size=self.generation_config.cache_config.get("batch_size"), + max_static_cache_length=self.generation_config.cache_config.max_cache_len, + batch_size=self.generation_config.cache_config.batch_size, ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index e805ad4e..70447957 100644 --- a/optimum/exporters/executorch/utils.py +++ 
b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = cache_config.get("batch_size") - max_seq_len = cache_config.get("max_cache_len") + max_batch_size = getattr(cache_config, "batch_size", None) + max_seq_len = getattr(cache_config, "max_cache_len", None) if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index 7d447fa6..c7fa93ed 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.54.1", + "transformers==4.51.3", ] TESTS_REQUIRE = [ From 108ed173f63a028618021c9c928cbaa89aa45867 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 13:56:04 -0700 Subject: [PATCH 15/31] Revert "Experiment reverting transformers bump" This reverts commit 99805f82be3461bb9bd269a29042c111f79ef079. --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 +++++++++---------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 38 insertions(+), 41 deletions(-) diff --git a/install_dev.py b/install_dev.py index 486edb3c..3fac4546 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index b5416f87..a9d912ab 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for _ in range(config.num_hidden_layers): + for layer in self.layers: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,32 +202,29 @@ def __init__( layer_device_map=layer_device_map, ) - # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers + # Create a list of cache instances, one per layer. + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. 
self.kv_cache = torch.nn.ModuleList() - for layer_idx in range(config.num_hidden_layers): - # newer version of transfomer has is_sliding defined - # for HybridCache - if self.is_sliding[layer_idx]: + for layer in self.layers: + if layer.is_sliding: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.sliding_window_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.is_sliding[layer_idx]: + if self.layers[layer_idx].is_sliding: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache. + Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. This modifies the model in place. Args: @@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Dont know why we need to this even though @@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - 
max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.is_sliding[i]: + if module.cache.layers[i].is_sliding: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index e6447562..35406190 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.max_cache_len, - batch_size=self.generation_config.cache_config.batch_size, + max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), + batch_size=self.generation_config.cache_config.get("batch_size"), ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index 70447957..e805ad4e 100644 --- a/optimum/exporters/executorch/utils.py +++ b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = getattr(cache_config, "batch_size", None) - max_seq_len = getattr(cache_config, "max_cache_len", None) + max_batch_size = cache_config.get("batch_size") + max_seq_len = cache_config.get("max_cache_len") if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index c7fa93ed..7d447fa6 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.51.3", + "transformers==4.54.1", ] TESTS_REQUIRE = [ From 59778ebe5c06ec0202a7ed3b0d7b6324a45a7274 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 14:12:40 -0700 Subject: [PATCH 16/31] Formatting and remove logs --- optimum/executorch/modeling.py | 55 ++++++---------------------------- 1 file changed, 9 insertions(+), 46 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 55bfa477..80763533 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -104,19 +104,19 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon setattr(self, key, value) self.stats = Stats() - + # Initialize cleanup tracking self._temp_dir = None def __del__(self): """Clean up temporary files when the model instance is destroyed.""" self._cleanup_temp_resources() - + def _cleanup_temp_resources(self): """Clean up temporary directory and files.""" - if hasattr(self, '_temp_dir') and self._temp_dir is not None: + if hasattr(self, "_temp_dir") and 
self._temp_dir is not None: try: - if hasattr(self._temp_dir, 'cleanup'): + if hasattr(self._temp_dir, "cleanup"): # It's a TemporaryDirectory object logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}") self._temp_dir.cleanup() @@ -216,17 +216,7 @@ def _from_pretrained( subfolder=subfolder, local_files_only=local_files_only, ) - - from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa - from executorch.kernels import quantized # noqa - from executorch.extension.pybindings.portable_lib import _get_operator_names - print("----------- LOADED OPS ----------") - print('\n'.join(_get_operator_names())) - print("---------------------------------") model = _load_for_executorch(model_cache_path) - logging.info( - f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)" - ) return {default_file_name.removesuffix(_PTE_SUFFIX): model} @@ -304,15 +294,6 @@ def _export( for name, _ in executorch_progs.items(): models.update(cls._from_pretrained(save_dir_path, file_name=f"{name}.pte", config=config)) - # Log temp directory info for debugging - logging.info(f"Created temporary directory: {save_dir_path}") - for name in executorch_progs.keys(): - pte_file = save_dir_path / f"{name}.pte" - if pte_file.exists(): - logging.info(f"PTE file exists at export: {pte_file} (size: {pte_file.stat().st_size} bytes)") - else: - logging.warning(f"PTE file missing at export: {pte_file}") - return models, save_dir def _save_pretrained(self, save_directory): @@ -399,7 +380,9 @@ def from_pretrained( **kwargs, ) else: - logging.info(f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export.") + logging.info( + f"Pre-exported `.pte` artifact already exists in HuggingFace repo or provided file path for {model_id}, skipping export." + ) models_dict = {} for pte_file in pte_files: models_dict.update( @@ -418,12 +401,12 @@ def from_pretrained( ) model_instance = cls(models_dict, config) - + # Store the TemporaryDirectory reference to prevent GC if temp_dir is not None: model_instance._temp_dir = temp_dir logging.info(f"Stored temp directory reference in model: {temp_dir.name}") - + return model_instance @@ -695,32 +678,12 @@ def forward( Returns: torch.Tensor: Logits output from the model. 
""" - # Check if temp directory and PTE file still exist before forward pass - if hasattr(self, '_temp_dir') and self._temp_dir is not None: - temp_path = Path(self._temp_dir.name) - logging.info(f"Forward pass - temp directory exists: {temp_path.exists()}") - if temp_path.exists(): - pte_files = list(temp_path.glob("*.pte")) - logging.info(f"Forward pass - PTE files found: {len(pte_files)}") - for pte_file in pte_files: - logging.info(f"Forward pass - PTE file: {pte_file} exists: {pte_file.exists()}, size: {pte_file.stat().st_size if pte_file.exists() else 'N/A'}") - else: - logging.error(f"Forward pass - temp directory missing: {temp_path}") - else: - logging.info("Forward pass - no temp directory reference stored") - self.stats.on_model_execution_start() try: - logging.info("Running forward()...") logits = self.model.forward((input_ids, cache_position))[0] - logging.info(f"logits from forward(): {logits}") except Exception as e: shapes = {name: val.shape for name, val in locals().items() if hasattr(val, "shape")} - logging.error(f"Forward pass failed - temp dir exists: {hasattr(self, '_temp_dir') and self._temp_dir is not None}") - if hasattr(self, '_temp_dir') and self._temp_dir is not None: - temp_path = Path(self._temp_dir.name) - logging.error(f"Forward pass failed - temp directory: {temp_path} exists: {temp_path.exists()}") print(f"Exception: {e}.\n{self.model.method_meta('forward')}\narg shapes: {shapes}") raise From ff8a2a1fab745987fc1798c47539fb2a855c19bf Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:25:15 -0700 Subject: [PATCH 17/31] Bump ET release from 0.6 -> 0.7 --- .github/workflows/test_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index af830a02..13b61c12 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} - executorch-version: ['0.6.0', 'nightly'] + executorch-version: ['0.7.0', 'nightly'] python-version: ['3.11'] os: [macos-15, ubuntu-22.04] From a3009ca979c41b2335cd358e5375538d52c6007a Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:46:57 -0700 Subject: [PATCH 18/31] Bisect down to ET 20250701 --- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index 3fac4546..1f7a0bb7 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250730" + EXECUTORCH_NIGHTLY_VERSION = "dev20250701" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From ae488b1bd135d2e0c16a20a4da0aeaf0c1a8a147 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Sun, 10 Aug 2025 14:56:10 -0700 Subject: [PATCH 19/31] Experiment reverting transformers bump --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 ++++++++++--------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 41 insertions(+), 38 deletions(-) diff --git 
a/install_dev.py b/install_dev.py index 1f7a0bb7..bdf54083 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index a9d912ab..b5416f87 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: + for _ in range(config.num_hidden_layers): layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,29 +202,32 @@ def __init__( layer_device_map=layer_device_map, ) + # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer. - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. + # Create a list of cache instances, one per layer + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers self.kv_cache = torch.nn.ModuleList() - for layer in self.layers: - if layer.is_sliding: + for layer_idx in range(config.num_hidden_layers): + # newer version of transfomer has is_sliding defined + # for HybridCache + if self.is_sliding[layer_idx]: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.sliding_window_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=layer.max_batch_size, - max_context_length=layer.max_cache_len, - n_heads=layer.num_heads, - head_dim=layer.head_dim, + max_batch_size=self.max_batch_size, + max_context_length=self.max_cache_len, + n_heads=self.num_key_value_heads, + head_dim=self.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -281,7 +284,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.layers[layer_idx].is_sliding: + if self.is_sliding[layer_idx]: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -305,7 +308,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. + Replace all KV caches in the module with ETCustomStaticCache. This modifies the model in place. 
Args: @@ -339,18 +342,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Dont know why we need to this even though @@ -367,25 +370,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.get("batch_size"), - max_cache_len=generation_config.cache_config.get("max_cache_len"), - device=generation_config.cache_config.get("device"), + max_batch_size=generation_config.cache_config.batch_size, + max_cache_len=generation_config.cache_config.max_cache_len, + device=generation_config.cache_config.device, dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.layers[i].is_sliding: + if module.cache.is_sliding[i]: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 35406190..e6447562 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), - batch_size=self.generation_config.cache_config.get("batch_size"), + max_static_cache_length=self.generation_config.cache_config.max_cache_len, + batch_size=self.generation_config.cache_config.batch_size, ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index e805ad4e..70447957 100644 --- a/optimum/exporters/executorch/utils.py +++ 
b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = cache_config.get("batch_size") - max_seq_len = cache_config.get("max_cache_len") + max_batch_size = getattr(cache_config, "batch_size", None) + max_seq_len = getattr(cache_config, "max_cache_len", None) if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index 7d447fa6..c7fa93ed 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.54.1", + "transformers==4.51.3", ] TESTS_REQUIRE = [ From b7a2fa1361cc2c66690608f4930d99ea94c30330 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:49:13 -0700 Subject: [PATCH 20/31] Clean --- optimum/executorch/modeling.py | 3 +++ optimum/exporters/executorch/__main__.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index 80763533..fee2f1a2 100644 --- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -217,6 +217,9 @@ def _from_pretrained( local_files_only=local_files_only, ) model = _load_for_executorch(model_cache_path) + logging.info( + f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)" + ) return {default_file_name.removesuffix(_PTE_SUFFIX): model} diff --git a/optimum/exporters/executorch/__main__.py b/optimum/exporters/executorch/__main__.py index d505af64..854bf260 100644 --- a/optimum/exporters/executorch/__main__.py +++ b/optimum/exporters/executorch/__main__.py @@ -134,12 +134,14 @@ def main_export( # 1. Load model, apply source transformations, and torch.export() into a graph (ExportedProgram). logging.info(f"Loading {model_name_or_path} and exporting to static graph...") recipe_kwargs = kwargs.pop("recipe_kwargs", {}) + model = task_func(model_name_or_path, **kwargs) # 2. Export to ExecuTorch through ExecuTorch's lowering APIs. 
logging.info(f"Lowering {model_name_or_path} to ExecuTorch...") if not os.path.exists(output_dir): os.makedirs(output_dir) + return export_to_executorch( model=model, task=task, From 1e0a67124bafa6aff27689ad40c5ee302ff63d22 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:50:31 -0700 Subject: [PATCH 21/31] Bisect down to ET 20250628 --- install_dev.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_dev.py b/install_dev.py index bdf54083..4ce4fe2b 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250701" + EXECUTORCH_NIGHTLY_VERSION = "dev20250628" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" @@ -15,7 +15,7 @@ def install_torch_nightly_deps(): "-m", "pip", "install", - f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}", + f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}", f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", From 896f0da22f13a139cfd7f2a92b31ea6d8c13d20b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:57:44 -0700 Subject: [PATCH 22/31] Bisect down to ET 20250626 --- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index 4ce4fe2b..ae4c8a8b 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250628" + EXECUTORCH_NIGHTLY_VERSION = "dev20250626" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From abd641b44d2ffabab2858044f117d6d96169e34b Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:10:50 -0700 Subject: [PATCH 23/31] Revert "Bisect down to ET 20250626" This reverts commit 896f0da22f13a139cfd7f2a92b31ea6d8c13d20b. --- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index ae4c8a8b..4ce4fe2b 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250626" + EXECUTORCH_NIGHTLY_VERSION = "dev20250628" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From 7f7f9c2c7862a9b49ab6c513eb84d47b27fc279f Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:11:05 -0700 Subject: [PATCH 24/31] Revert "Bisect down to ET 20250628" This reverts commit 1e0a67124bafa6aff27689ad40c5ee302ff63d22. 
--- install_dev.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_dev.py b/install_dev.py index 4ce4fe2b..bdf54083 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250628" + EXECUTORCH_NIGHTLY_VERSION = "dev20250701" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" @@ -15,7 +15,7 @@ def install_torch_nightly_deps(): "-m", "pip", "install", - f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}", + f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}", f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}", f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}", f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}", From 5f8a56f4a10d97191ddaf300c0d26aa36e031f55 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:11:25 -0700 Subject: [PATCH 25/31] Revert "Experiment reverting transformers bump" This reverts commit ae488b1bd135d2e0c16a20a4da0aeaf0c1a8a147. --- install_dev.py | 2 +- .../executorch/attentions/custom_kv_cache.py | 67 +++++++++---------- optimum/exporters/executorch/integrations.py | 4 +- optimum/exporters/executorch/utils.py | 4 +- setup.py | 2 +- 5 files changed, 38 insertions(+), 41 deletions(-) diff --git a/install_dev.py b/install_dev.py index bdf54083..1f7a0bb7 100644 --- a/install_dev.py +++ b/install_dev.py @@ -34,7 +34,7 @@ def install_dep_from_source(): "-m", "pip", "install", - "git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.54.1 + "git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1 ] ) subprocess.check_call( diff --git a/optimum/executorch/attentions/custom_kv_cache.py b/optimum/executorch/attentions/custom_kv_cache.py index b5416f87..a9d912ab 100644 --- a/optimum/executorch/attentions/custom_kv_cache.py +++ b/optimum/executorch/attentions/custom_kv_cache.py @@ -54,12 +54,12 @@ def __init__( # Create a list of CustomKVCache instances, one per layer self.kv_cache = torch.nn.ModuleList() - for _ in range(config.num_hidden_layers): + for layer in self.layers: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -202,32 +202,29 @@ def __init__( layer_device_map=layer_device_map, ) - # make sure layer_device_map is none assert layer_device_map is None assert device is None or device == "cpu", "Device must be None or 'cpu'" self.cache_position = None - # Create a list of cache instances, one per layer - # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers + # Create a list of cache instances, one per layer. + # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers. 
self.kv_cache = torch.nn.ModuleList() - for layer_idx in range(config.num_hidden_layers): - # newer version of transfomer has is_sliding defined - # for HybridCache - if self.is_sliding[layer_idx]: + for layer in self.layers: + if layer.is_sliding: # This is a sliding window layer layer_cache = CustomRingKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.sliding_window_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) else: layer_cache = CustomKVCache( - max_batch_size=self.max_batch_size, - max_context_length=self.max_cache_len, - n_heads=self.num_key_value_heads, - head_dim=self.head_dim, + max_batch_size=layer.max_batch_size, + max_context_length=layer.max_cache_len, + n_heads=layer.num_heads, + head_dim=layer.head_dim, dtype=dtype, ) self.kv_cache.append(layer_cache) @@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: # For CustomRingKVCache, we need to handle the sequence length differently layer_cache = self.kv_cache[layer_idx] - if self.is_sliding[layer_idx]: + if self.layers[layer_idx].is_sliding: # CustomRingKVCache cache_position_manager which # maintains cache position for each slot in the kv cache # we return the max position + 1 to indicate max position @@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int): def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype): """ - Replace all KV caches in the module with ETCustomStaticCache. + Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache. This modifies the model in place. Args: @@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(static_cache) else: module.static_cache = ETCustomStaticCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Dont know why we need to this even though @@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt if getattr(module, "replace_cache", None) is not None: hybrid_cache = ETCustomHybridCache( config=config, - max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) module.replace_cache(hybrid_cache) else: module.cache = ETCustomHybridCache( config=config, - 
max_batch_size=generation_config.cache_config.batch_size, - max_cache_len=generation_config.cache_config.max_cache_len, - device=generation_config.cache_config.device, + max_batch_size=generation_config.cache_config.get("batch_size"), + max_cache_len=generation_config.cache_config.get("max_cache_len"), + device=generation_config.cache_config.get("device"), dtype=cache_dtype, ) # Register cache attributes for each layer for i in range(len(module.cache.kv_cache)): setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache) setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache) - if module.cache.is_sliding[i]: + if module.cache.layers[i].is_sliding: # Register cache_positions as buffer for sliding window layers # This prevents it from being traced as a constant module.register_buffer( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index e6447562..35406190 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -399,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( model=self.full_model, - max_static_cache_length=self.generation_config.cache_config.max_cache_len, - batch_size=self.generation_config.cache_config.batch_size, + max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"), + batch_size=self.generation_config.cache_config.get("batch_size"), ) .to("cpu") .eval() diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py index 70447957..e805ad4e 100644 --- a/optimum/exporters/executorch/utils.py +++ b/optimum/exporters/executorch/utils.py @@ -53,8 +53,8 @@ def save_config_to_constant_methods( # Check for cache_config and its attributes cache_config = getattr(generation_config, "cache_config", None) if cache_config is not None: - max_batch_size = getattr(cache_config, "batch_size", None) - max_seq_len = getattr(cache_config, "max_cache_len", None) + max_batch_size = cache_config.get("batch_size") + max_seq_len = cache_config.get("max_cache_len") if max_batch_size is not None: metadata["get_max_batch_size"] = max_batch_size diff --git a/setup.py b/setup.py index c7fa93ed..7d447fa6 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ INSTALL_REQUIRE = [ "optimum~=1.24", "executorch>=0.6.0", - "transformers==4.51.3", + "transformers==4.54.1", ] TESTS_REQUIRE = [ From 92bc2ba3f0659c7681a3106b123b75a0017c92ce Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:11:36 -0700 Subject: [PATCH 26/31] Revert "Bisect down to ET 20250701" This reverts commit a3009ca979c41b2335cd358e5375538d52c6007a. 
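Aside, not part of the patch series: the transformers 4.54 bump restored in PATCH 25 changes two call-site patterns that recur throughout these diffs. generation_config.cache_config is now read dict-style with .get("batch_size") / .get("max_cache_len") instead of attribute access, and per-layer sliding-window checks move from cache.is_sliding[i] to cache.layers[i].is_sliding. A version-agnostic sketch of both lookups follows; the helper names are illustrative and not part of the patched code, and the dict assumption is inferred from the .get() call sites shown above:

    def read_cache_config(cache_config, key, default=None):
        """Read a cache_config entry on either side of the transformers 4.54 change."""
        # Assumption: on transformers >= 4.54 cache_config behaves like a plain dict,
        # as the .get("batch_size") call sites in PATCH 25 suggest.
        if isinstance(cache_config, dict):
            return cache_config.get(key, default)
        # Earlier releases expose attribute-style access (cache_config.batch_size, ...).
        return getattr(cache_config, key, default)


    def layer_is_sliding(cache, layer_idx):
        """Check whether a hybrid-cache layer is a sliding-window layer."""
        # transformers >= 4.54: per-layer cache objects expose .is_sliding.
        layers = getattr(cache, "layers", None)
        if layers is not None:
            return bool(layers[layer_idx].is_sliding)
        # Earlier releases: HybridCache keeps a flat is_sliding list.
        return bool(cache.is_sliding[layer_idx])


    # Usage mirroring the call sites touched in PATCH 25:
    # max_batch_size = read_cache_config(generation_config.cache_config, "batch_size")
    # max_cache_len  = read_cache_config(generation_config.cache_config, "max_cache_len")
    # needs_ring_kv  = layer_is_sliding(module.cache, layer_idx)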
--- install_dev.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_dev.py b/install_dev.py index 1f7a0bb7..3fac4546 100644 --- a/install_dev.py +++ b/install_dev.py @@ -5,7 +5,7 @@ def install_torch_nightly_deps(): """Install torch related dependencies from pinned nightly""" - EXECUTORCH_NIGHTLY_VERSION = "dev20250701" + EXECUTORCH_NIGHTLY_VERSION = "dev20250730" TORCHAO_NIGHTLY_VERSION = "dev20250730" # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74 TORCH_NIGHTLY_VERSION = "dev20250725" From 4abb2eccf1e60267b84f1f2a956ace4e3b3eb222 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:43:21 -0700 Subject: [PATCH 27/31] Skip mac tests --- .github/workflows/test_models.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml index 13b61c12..a72dba55 100644 --- a/.github/workflows/test_models.yml +++ b/.github/workflows/test_models.yml @@ -36,7 +36,8 @@ jobs: test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} executorch-version: ['0.7.0', 'nightly'] python-version: ['3.11'] - os: [macos-15, ubuntu-22.04] + # os: [macos-15, ubuntu-22.04] # TODO(#122): Re-enable the mac tests after fixing seg fault. + os: [ubuntu-22.04] # Custom job name, now shortened and cleaner name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) From ad9b639461ae7e849e7b3b255557f0fa3f52476c Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:44:19 -0700 Subject: [PATCH 28/31] Remove unnecessary ET 0.6 guards --- optimum/exporters/executorch/integrations.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 35406190..532aa4d4 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -90,10 +90,7 @@ def _prepare_export_inputs(self): return example_input_ids, example_cache_position, dynamic_shapes, strict def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module): - if ( - is_transformers_version(">=", "4.53.0.dev0") - and parse(executorch_version.__version__).base_version > "0.6.0" - ): + if is_transformers_version(">=", "4.53.0.dev0"): from transformers.integrations.executorch import sdpa_mask_without_vmap from transformers.masking_utils import AttentionMaskInterface from transformers.modeling_utils import AttentionInterface @@ -130,7 +127,7 @@ def export( ) self._register_attention_mask_for_4_53(exportable_module) - if self.use_custom_kv_cache and parse(executorch_version.__version__).base_version > "0.6.0": + if self.use_custom_kv_cache: from optimum.executorch.attentions.custom_kv_cache import ( replace_with_et_custom_kv_cache, ) From b252038b4feec37e775fb96f7ba740f2a6726b07 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 09:22:27 -0700 Subject: [PATCH 29/31] Ruff format --- optimum/executorch/modeling.py | 5 ++--- optimum/exporters/executorch/integrations.py | 1 - optimum/exporters/executorch/quantization.py | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py index fee2f1a2..58218e51 100644 
--- a/optimum/executorch/modeling.py +++ b/optimum/executorch/modeling.py @@ -16,7 +16,6 @@ import logging import os -import tempfile import shutil from abc import ABC, abstractmethod from pathlib import Path @@ -120,12 +119,12 @@ def _cleanup_temp_resources(self): # It's a TemporaryDirectory object logging.info(f"Cleaning up temporary directory: {self._temp_dir.name}") self._temp_dir.cleanup() - logging.info(f"Temporary directory cleanup completed") + logging.info("Temporary directory cleanup completed") elif isinstance(self._temp_dir, (str, Path)): # It's a path logging.info(f"Cleaning up temporary path: {self._temp_dir}") shutil.rmtree(self._temp_dir, ignore_errors=True) - logging.info(f"Temporary path cleanup completed") + logging.info("Temporary path cleanup completed") except Exception as e: # Log cleanup errors for debugging logging.warning(f"Error during temp directory cleanup: {e}") diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 532aa4d4..06522c27 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -28,7 +28,6 @@ ) from transformers.generation.configuration_utils import GenerationConfig -from executorch import version as executorch_version from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache from optimum.utils.import_utils import is_transformers_version diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py index 299a2ddc..8994fd41 100644 --- a/optimum/exporters/executorch/quantization.py +++ b/optimum/exporters/executorch/quantization.py @@ -16,8 +16,6 @@ from typing import Optional import torch -import torchao -from packaging.version import parse def quantize_model_( From e1353103e2da61b1e70e3af634651ae527f1bad0 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 11:19:36 -0700 Subject: [PATCH 30/31] Remove all transformers < 4.54 guards --- optimum/exporters/executorch/convert.py | 9 +- optimum/exporters/executorch/integrations.py | 115 ++++++++---------- tests/models/test_modeling_gemma3.py | 20 --- tests/models/test_modeling_phi4.py | 7 -- tests/models/test_modeling_qwen3.py | 9 -- tests/models/test_modeling_qwen3_embedding.py | 6 - tests/models/test_modeling_smollm3.py | 6 +- tests/models/test_modeling_whisper.py | 5 - 8 files changed, 57 insertions(+), 120 deletions(-) diff --git a/optimum/exporters/executorch/convert.py b/optimum/exporters/executorch/convert.py index 612b07fb..8a381423 100644 --- a/optimum/exporters/executorch/convert.py +++ b/optimum/exporters/executorch/convert.py @@ -19,20 +19,17 @@ from pathlib import Path from typing import Union +from transformers.integrations.executorch import sdpa_mask_without_vmap +from transformers.masking_utils import AttentionMaskInterface from transformers.modeling_utils import AttentionInterface from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward -from optimum.utils.import_utils import is_transformers_version from .recipe_registry import discover_recipes, recipe_registry AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward) -if is_transformers_version(">=", "4.53.0.dev0"): - from transformers.integrations.executorch import sdpa_mask_without_vmap - from transformers.masking_utils import AttentionMaskInterface - - AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap) 
+AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap) def export_to_executorch( diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 06522c27..954a95da 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -29,7 +29,6 @@ from transformers.generation.configuration_utils import GenerationConfig from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache -from optimum.utils.import_utils import is_transformers_version from .utils import save_config_to_constant_methods @@ -71,7 +70,7 @@ def _prepare_export_inputs(self): and not (self.use_custom_kv_cache and self.use_custom_sdpa) ) - if is_transformers_version(">", "4.52.0") and not is_using_hybrid_cache_wo_custom_sdpa_kv_cache: + if not is_using_hybrid_cache_wo_custom_sdpa_kv_cache: # Prepare inputs with dynamic shapes seq_length = 3 # Sequence length > 1 to avoid specialization issues example_input_ids = torch.zeros((1, seq_length), dtype=torch.long) @@ -88,24 +87,23 @@ def _prepare_export_inputs(self): return example_input_ids, example_cache_position, dynamic_shapes, strict - def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module): - if is_transformers_version(">=", "4.53.0.dev0"): - from transformers.integrations.executorch import sdpa_mask_without_vmap - from transformers.masking_utils import AttentionMaskInterface - from transformers.modeling_utils import AttentionInterface - - _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module) - if self.use_custom_sdpa: - if self.use_custom_kv_cache: - AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache) - AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap) - # Manually set the attention implementation to custom_sdpa_ring_kv_cache - # This handles both regular sdpa and one for sliding window/local attention - exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache" - else: - # Manually set the attention implementation to custom_sdpa_ring_kv_cache - # This handles both regular sdpa and one for sliding window/local attention - exportable_module.model.model.config._attn_implementation = "custom_sdpa" + def _register_custom_attention(self, exportable_module: torch.nn.Module): + from transformers.integrations.executorch import sdpa_mask_without_vmap + from transformers.masking_utils import AttentionMaskInterface + from transformers.modeling_utils import AttentionInterface + + _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module) + if self.use_custom_sdpa: + if self.use_custom_kv_cache: + AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache) + AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap) + # Manually set the attention implementation to custom_sdpa_ring_kv_cache + # This handles both regular sdpa and one for sliding window/local attention + exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache" + else: + # Manually set the attention implementation to custom_sdpa_ring_kv_cache + # This handles both regular sdpa and one for sliding window/local attention + exportable_module.model.model.config._attn_implementation = "custom_sdpa" def export( self, @@ -114,55 +112,48 @@ def export( logging.info( f"Exporting using input_ids({input_ids.shape})={input_ids}, 
cache_position({cache_position.shape})={cache_position}, dynamic_shapes={dynamic_shapes}, strict={strict}" ) - if is_transformers_version(">", "4.52.0"): - from transformers.integrations.executorch import ( - TorchExportableModuleForDecoderOnlyLM, - ) - exportable_module = TorchExportableModuleForDecoderOnlyLM( - self.model, - max_batch_size=1, - max_cache_len=self.metadata.get("get_max_seq_len"), - ) - self._register_attention_mask_for_4_53(exportable_module) + from transformers.integrations.executorch import ( + TorchExportableModuleForDecoderOnlyLM, + ) - if self.use_custom_kv_cache: - from optimum.executorch.attentions.custom_kv_cache import ( - replace_with_et_custom_kv_cache, - ) + exportable_module = TorchExportableModuleForDecoderOnlyLM( + self.model, + max_batch_size=1, + max_cache_len=self.metadata.get("get_max_seq_len"), + ) + self._register_custom_attention(exportable_module) - replace_with_et_custom_kv_cache( - exportable_module.model, - self.model.config, - self.model.generation_config, - self.model.dtype, - ) + if self.use_custom_kv_cache: + from optimum.executorch.attentions.custom_kv_cache import ( + replace_with_et_custom_kv_cache, + ) - with torch.no_grad(): - exported_program = exportable_module.export(input_ids, cache_position, dynamic_shapes, strict) - # Apply RemoveTransposes pass to remove - # any back-to-back transpose ops that are not needed - # e.g. output of update_cache is transposed and - # input to custom_sdpa is transposed. - from executorch.extension.llm.export.export_passes import ( - RemoveRedundantTransposes, - ) + replace_with_et_custom_kv_cache( + exportable_module.model, + self.model.config, + self.model.generation_config, + self.model.dtype, + ) - mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0] - exported_program = torch.export.export( - mutated_gm, - args=(input_ids, cache_position), - kwargs={}, - dynamic_shapes=dynamic_shapes, - strict=strict, - ) - else: - # Path to use legacy API, static export only due to pinned transformers version - from transformers.integrations.executorch import ( - convert_and_export_with_cache, + with torch.no_grad(): + exported_program = exportable_module.export(input_ids, cache_position, dynamic_shapes, strict) + # Apply RemoveTransposes pass to remove + # any back-to-back transpose ops that are not needed + # e.g. output of update_cache is transposed and + # input to custom_sdpa is transposed. 
+ from executorch.extension.llm.export.export_passes import ( + RemoveRedundantTransposes, ) - exported_program = convert_and_export_with_cache(self.model, input_ids, cache_position) + mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0] + exported_program = torch.export.export( + mutated_gm, + args=(input_ids, cache_position), + kwargs={}, + dynamic_shapes=dynamic_shapes, + strict=strict, + ) return {"model": exported_program} diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py index 56666c04..11e91174 100644 --- a/tests/models/test_modeling_gemma3.py +++ b/tests/models/test_modeling_gemma3.py @@ -22,15 +22,11 @@ import unittest import pytest -import torchao -import transformers from executorch.extension.pybindings.portable_lib import ExecuTorchModule -from packaging.version import parse from transformers import AutoTokenizer from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForCausalLM -from optimum.utils.import_utils import is_transformers_version from ..utils import check_causal_lm_output_quality @@ -41,10 +37,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" -@pytest.mark.skipif( - is_transformers_version("<", "4.52.0.dev0"), - reason="Only available on transformers >= 4.52.0.dev0", -) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -119,10 +111,6 @@ def test_gemma3_text_generation_portable(self): @slow @pytest.mark.run_slow @pytest.mark.skipif(is_linux_ci, reason="OOM on linux runner") - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" @@ -191,10 +179,6 @@ def test_gemma3_text_generation_with_custom_sdpa_float16(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" @@ -230,10 +214,6 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0", - ) def test_gemma3_text_generation_with_custom_sdpa_kv_cache_8da4w_8we(self): # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI # model_id = "google/gemma-3-1b-it" diff --git a/tests/models/test_modeling_phi4.py b/tests/models/test_modeling_phi4.py index b0851444..fa1c9be6 100644 --- a/tests/models/test_modeling_phi4.py +++ b/tests/models/test_modeling_phi4.py @@ -21,7 +21,6 @@ import pytest import torchao -import transformers from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse from transformers import AutoConfig, AutoTokenizer @@ -43,12 +42,6 @@ def 
__init__(self, *args, **kwargs): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - is_linux_ci - or parse(transformers.__version__) < parse("4.52.0") - or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0. OOM on linux runner.", - ) def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "microsoft/Phi-4-mini-instruct" model = ExecuTorchModelForCausalLM.from_pretrained( diff --git a/tests/models/test_modeling_qwen3.py b/tests/models/test_modeling_qwen3.py index 0f0d8a36..4deacac8 100644 --- a/tests/models/test_modeling_qwen3.py +++ b/tests/models/test_modeling_qwen3.py @@ -23,7 +23,6 @@ import pytest import torchao -import transformers from executorch import version from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse @@ -214,10 +213,6 @@ def test_qwen3_text_generation_with_custom_sdpa_8da4w_8we(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0.dev0"), - reason="Only available on transformers >= 4.52.0.dev0", - ) def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache(self): model_id = "Qwen/Qwen3-0.6B" prompt = "Give me a short introduction to large language model." @@ -249,10 +244,6 @@ def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache(self): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0", - ) def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "Qwen/Qwen3-0.6B" prompt = "Give me a short introduction to large language model." 
diff --git a/tests/models/test_modeling_qwen3_embedding.py b/tests/models/test_modeling_qwen3_embedding.py index 73002e9c..0146634f 100644 --- a/tests/models/test_modeling_qwen3_embedding.py +++ b/tests/models/test_modeling_qwen3_embedding.py @@ -19,8 +19,6 @@ import unittest import pytest -import torchao -import transformers from executorch import version from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse @@ -41,10 +39,6 @@ def __init__(self, *args, **kwargs): @slow @pytest.mark.run_slow - @pytest.mark.skipif( - parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"), - reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0", - ) def test_qwen3_embedding_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): model_id = "Qwen/Qwen3-Embedding-0.6B" prompt = "Explain gravity" diff --git a/tests/models/test_modeling_smollm3.py b/tests/models/test_modeling_smollm3.py index 10dce8ee..12eb3c63 100644 --- a/tests/models/test_modeling_smollm3.py +++ b/tests/models/test_modeling_smollm3.py @@ -25,7 +25,6 @@ from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForCausalLM -from optimum.utils.import_utils import is_transformers_version from ..utils import check_causal_lm_output_quality @@ -35,10 +34,7 @@ is_linux_ci = sys.platform.startswith("linux") and is_ci -@pytest.mark.skipif( - is_transformers_version("<", "4.53.1"), - reason="Only available on transformers >= 4.53.1", -) +@pytest.mark.skipif(is_linux_ci) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/tests/models/test_modeling_whisper.py b/tests/models/test_modeling_whisper.py index 3ecfc1c7..e88cb80f 100644 --- a/tests/models/test_modeling_whisper.py +++ b/tests/models/test_modeling_whisper.py @@ -28,16 +28,11 @@ from transformers.testing_utils import slow from optimum.executorch import ExecuTorchModelForSpeechSeq2Seq -from optimum.utils.import_utils import is_transformers_version os.environ["TOKENIZERS_PARALLELISM"] = "false" -@pytest.mark.skipif( - is_transformers_version(">", "4.52.4"), - reason="Need to fix in the transformers due to attention refactor https://github.com/huggingface/transformers/pull/38235", -) class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 70338e914822ef60373952d159508b53d986cda1 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:09:14 -0700 Subject: [PATCH 31/31] Format --- tests/models/test_modeling_smollm3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_modeling_smollm3.py b/tests/models/test_modeling_smollm3.py index 12eb3c63..1f3e26a2 100644 --- a/tests/models/test_modeling_smollm3.py +++ b/tests/models/test_modeling_smollm3.py @@ -34,7 +34,7 @@ is_linux_ci = sys.platform.startswith("linux") and is_ci -@pytest.mark.skipif(is_linux_ci) +@pytest.mark.skipif(is_linux_ci, reason="Runner OOM") class ExecuTorchModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -72,7 +72,7 @@ def test_smollm3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): @slow @pytest.mark.run_slow @pytest.mark.portable - @pytest.mark.skipif(is_ci, reason="Too big for CI runners") + @pytest.mark.skipif(is_ci, reason="Runner OOM") def 
test_smollm3_text_generation_portable(self): model_id = "HuggingFaceTB/SmolLM3-3B" prompt = "Give me a brief explanation of gravity in simple terms."
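Aside, not part of the patch series: the final patch also highlights a pytest detail behind the one-line fix, since @pytest.mark.skipif(is_linux_ci) with a plain boolean condition and no reason= is rejected by pytest ("you need to specify reason=STRING when using booleans as conditions"). A self-contained sketch of the corrected marker; how is_ci is derived from the environment is an assumption, as the diffs do not show it:

    import os
    import sys

    import pytest

    # Assumption: the test modules derive is_ci from a CI environment flag;
    # the exact variable name is not visible in the diffs above.
    is_ci = os.environ.get("CI", "").lower() == "true"
    is_linux_ci = sys.platform.startswith("linux") and is_ci


    # reason= is required whenever the skipif condition is a boolean value.
    @pytest.mark.skipif(is_linux_ci, reason="Runner OOM")
    def test_smollm3_smoke():
        assert True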