Merged

Commits (32 total; changes shown from 8 commits)
3a960bb - Bump transformers to 4.54.1 (jackzhxng, Aug 1, 2025)
3d223a2 - Bump torch (jackzhxng, Aug 1, 2025)
207f8b1 - Fix no module found error for custom_kv_cache (jackzhxng, Aug 4, 2025)
bc82841 - Try to fix Missing operator: [8] quantized_decomposed::embedding_byte… (jackzhxng, Aug 4, 2025)
35fc918 - Fix quantization requires torchao >= 0.11.0 (jackzhxng, Aug 4, 2025)
6a26464 - Fix sliding window, print loaded ops (jackzhxng, Aug 5, 2025)
4d68263 - Bump ET nightly pin, fixes missing quantized ops (jackzhxng, Aug 5, 2025)
6a3e1d4 - Fix no Q_ANNOTATION_KEY (jackzhxng, Aug 5, 2025)
2b5fe7e - Try to fix segfault/bus error by holding onto temp dir (jackzhxng, Aug 8, 2025)
bb0089c - Bigger mac runners (jackzhxng, Aug 9, 2025)
72802e3 - Revert "Bigger mac runners" (jackzhxng, Aug 10, 2025)
9876c7e - Add helpful logs (jackzhxng, Aug 10, 2025)
19f4d21 - Re-enable smollm3 tests for linux (jackzhxng, Aug 10, 2025)
99805f8 - Experiment reverting transformers bump (jackzhxng, Aug 10, 2025)
108ed17 - Revert "Experiment reverting transformers bump" (jackzhxng, Aug 13, 2025)
59778eb - Formatting and remove logs (jackzhxng, Aug 13, 2025)
ff8a2a1 - Bump ET release from 0.6 -> 0.7 (jackzhxng, Aug 14, 2025)
a3009ca - Bisect down to ET 20250701 (jackzhxng, Aug 14, 2025)
ae488b1 - Experiment reverting transformers bump (jackzhxng, Aug 10, 2025)
b7a2fa1 - Clean (jackzhxng, Aug 14, 2025)
1e0a671 - Bisect down to ET 20250628 (jackzhxng, Aug 14, 2025)
896f0da - Bisect down to ET 20250626 (jackzhxng, Aug 14, 2025)
abd641b - Revert "Bisect down to ET 20250626" (jackzhxng, Aug 15, 2025)
7f7f9c2 - Revert "Bisect down to ET 20250628" (jackzhxng, Aug 15, 2025)
5f8a56f - Revert "Experiment reverting transformers bump" (jackzhxng, Aug 15, 2025)
92bc2ba - Revert "Bisect down to ET 20250701" (jackzhxng, Aug 15, 2025)
4abb2ec - Skip mac tests (jackzhxng, Aug 15, 2025)
ad9b639 - Remove unnecessary ET 0.6 guards (jackzhxng, Aug 15, 2025)
b252038 - Ruff format (jackzhxng, Aug 15, 2025)
e135310 - Remove all transformers < 4.54 guards (jackzhxng, Aug 15, 2025)
671bc06 - Merge branch 'main' into jz/bump-transformers (jackzhxng, Aug 15, 2025)
70338e9 - Format (jackzhxng, Aug 15, 2025)
16 changes: 8 additions & 8 deletions install_dev.py
@@ -5,21 +5,21 @@

def install_torch_nightly_deps():
"""Install torch related dependencies from pinned nightly"""
EXECUTORCH_NIGHTLY_VERSION = "dev20250625"
TORCHAO_NIGHTLY_VERSION = "dev20250620"
EXECUTORCH_NIGHTLY_VERSION = "dev20250730"
TORCHAO_NIGHTLY_VERSION = "dev20250730"
# Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
TORCH_NIGHTLY_VERSION = "dev20250601"
TORCH_NIGHTLY_VERSION = "dev20250725"
subprocess.check_call(
[
sys.executable,
"-m",
"pip",
"install",
f"executorch==0.7.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.8.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.23.0.{TORCH_NIGHTLY_VERSION}",
f"executorch==0.8.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.9.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.24.0.{TORCH_NIGHTLY_VERSION}",
f"torchaudio==2.8.0.{TORCH_NIGHTLY_VERSION}",
f"torchao==0.12.0.{TORCHAO_NIGHTLY_VERSION}",
f"torchao==0.13.0.{TORCHAO_NIGHTLY_VERSION}",
"--extra-index-url",
"https://download.pytorch.org/whl/nightly/cpu",
]
@@ -34,7 +34,7 @@ def install_dep_from_source():
"-m",
"pip",
"install",
"git+https://github.com/huggingface/transformers@896e9cea1ade521b2648f4798218550f6c72190c#egg=transformers", # 4.53.1
"git+https://github.com/huggingface/transformers@9c641dc16154964e5ffc0c13e9ec6aaffa295ed6#egg=transformers", # 4.54.1
]
)
subprocess.check_call(
67 changes: 32 additions & 35 deletions optimum/executorch/attentions/custom_kv_cache.py
@@ -54,12 +54,12 @@ def __init__(

# Create a list of CustomKVCache instances, one per layer
self.kv_cache = torch.nn.ModuleList()
for _ in range(config.num_hidden_layers):
[Comment] Collaborator: what happened here? like config doesnt exist anymore?
[Reply] jackzhxng (Author): It still exists, feel like it's more idiomatic to iterate over the actual layers
for layer in self.layers:
layer_cache = CustomKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.max_cache_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
self.kv_cache.append(layer_cache)
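
The change above replaces index-based iteration over `config.num_hidden_layers` with iteration over the cache's layer objects, as discussed in the thread. A minimal, self-contained sketch of that pattern; `LayerSpec` and `SimpleLayerCache` are hypothetical stand-ins, not the real transformers or ExecuTorch classes:

```python
# Hypothetical sketch only: LayerSpec and SimpleLayerCache are illustrative
# stand-ins, not classes from transformers or ExecuTorch.
from dataclasses import dataclass

import torch


@dataclass
class LayerSpec:
    max_batch_size: int
    max_cache_len: int
    num_heads: int
    head_dim: int


class SimpleLayerCache(torch.nn.Module):
    def __init__(self, spec: LayerSpec, dtype: torch.dtype = torch.float32):
        super().__init__()
        # Pre-allocate key/value buffers sized from this layer's own geometry.
        shape = (spec.max_batch_size, spec.num_heads, spec.max_cache_len, spec.head_dim)
        self.register_buffer("k_cache", torch.zeros(shape, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(shape, dtype=dtype))


# Build one cache per layer by reading sizes off each layer, not off a global config.
layers = [LayerSpec(max_batch_size=1, max_cache_len=128, num_heads=8, head_dim=64) for _ in range(4)]
kv_cache = torch.nn.ModuleList([SimpleLayerCache(layer) for layer in layers])
print(len(kv_cache), tuple(kv_cache[0].k_cache.shape))
```
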
@@ -202,32 +202,29 @@ def __init__(
layer_device_map=layer_device_map,
)

# make sure layer_device_map is none
assert layer_device_map is None
assert device is None or device == "cpu", "Device must be None or 'cpu'"

self.cache_position = None
# Create a list of cache instances, one per layer
# Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers
# Create a list of cache instances, one per layer.
# Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers.
self.kv_cache = torch.nn.ModuleList()
for layer_idx in range(config.num_hidden_layers):
# newer version of transfomer has is_sliding defined
# for HybridCache
if self.is_sliding[layer_idx]:
for layer in self.layers:
if layer.is_sliding:
# This is a sliding window layer
layer_cache = CustomRingKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.sliding_window_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
[Comment] Collaborator: wait what is happening here? is this same as sliding_window_len
[Reply] jackzhxng (Author), Aug 15, 2025
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
else:
layer_cache = CustomKVCache(
max_batch_size=self.max_batch_size,
max_context_length=self.max_cache_len,
n_heads=self.num_key_value_heads,
head_dim=self.head_dim,
max_batch_size=layer.max_batch_size,
max_context_length=layer.max_cache_len,
n_heads=layer.num_heads,
head_dim=layer.head_dim,
dtype=dtype,
)
self.kv_cache.append(layer_cache)
@@ -284,7 +281,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:

# For CustomRingKVCache, we need to handle the sequence length differently
layer_cache = self.kv_cache[layer_idx]
if self.is_sliding[layer_idx]:
if self.layers[layer_idx].is_sliding:
# CustomRingKVCache cache_position_manager which
# maintains cache position for each slot in the kv cache
# we return the max position + 1 to indicate max position
Expand All @@ -308,7 +305,7 @@ def get_layer_cache(self, layer_idx: int):

def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype):
"""
Replace all KV caches in the module with ETCustomStaticCache.
Replace all KV caches in the module with ETCustomStaticCache or ETCustomHybridCache.
This modifies the model in place.

Args:
@@ -342,18 +339,18 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
if getattr(module, "replace_cache", None) is not None:
static_cache = ETCustomStaticCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
module.replace_cache(static_cache)
else:
module.static_cache = ETCustomStaticCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
# Dont know why we need to this even though
@@ -370,25 +367,25 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
if getattr(module, "replace_cache", None) is not None:
hybrid_cache = ETCustomHybridCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
module.replace_cache(hybrid_cache)
else:
module.cache = ETCustomHybridCache(
config=config,
max_batch_size=generation_config.cache_config.batch_size,
max_cache_len=generation_config.cache_config.max_cache_len,
device=generation_config.cache_config.device,
max_batch_size=generation_config.cache_config.get("batch_size"),
max_cache_len=generation_config.cache_config.get("max_cache_len"),
device=generation_config.cache_config.get("device"),
dtype=cache_dtype,
)
# Register cache attributes for each layer
for i in range(len(module.cache.kv_cache)):
setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
if module.cache.is_sliding[i]:
if module.cache.layers[i].is_sliding:
# Register cache_positions as buffer for sliding window layers
# This prevents it from being traced as a constant
module.register_buffer(
8 changes: 8 additions & 0 deletions optimum/executorch/modeling.py
@@ -24,6 +24,7 @@
import torch
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa
from transformers import (
AutoModelForCausalLM,
AutoModelForImageClassification,
@@ -185,6 +186,13 @@ def _from_pretrained(
subfolder=subfolder,
local_files_only=local_files_only,
)

from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa
from executorch.kernels import quantized # noqa
from executorch.extension.pybindings.portable_lib import _get_operator_names
print("----------- LOADED OPS ----------")
print('\n'.join(_get_operator_names()))
print("---------------------------------")
model = _load_for_executorch(model_cache_path)
logging.info(
f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)"
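
The imports and logging added above exist to make sure the quantized kernels are registered with the runtime before the .pte program is loaded. A condensed sketch of that sanity check, with a hypothetical model path; the imports mirror the ones in the diff:

```python
# Importing these modules registers their operators with the ExecuTorch runtime;
# the imported names themselves are unused (hence the noqa imports in the diff).
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
from executorch.kernels import quantized  # noqa: F401
from executorch.extension.pybindings.portable_lib import (
    _get_operator_names,
    _load_for_executorch,
)

ops = _get_operator_names()
print(f"{len(ops)} operators registered")
if not any("quantized_decomposed" in name for name in ops):
    raise RuntimeError("quantized_decomposed ops missing; check the executorch install")

module = _load_for_executorch("model.pte")  # hypothetical path to an exported program
```
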
12 changes: 8 additions & 4 deletions optimum/exporters/executorch/integrations.py
@@ -28,6 +28,7 @@
)
from transformers.generation.configuration_utils import GenerationConfig

from executorch import version as executorch_version
from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache
from optimum.utils.import_utils import is_transformers_version

@@ -89,7 +90,10 @@ def _prepare_export_inputs(self):
return example_input_ids, example_cache_position, dynamic_shapes, strict

def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module):
if is_transformers_version(">=", "4.53.0.dev0"):
if (
is_transformers_version(">=", "4.53.0.dev0")
and parse(executorch_version.__version__).base_version > "0.6.0"
):
from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface
from transformers.modeling_utils import AttentionInterface
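
The guards added in this file gate new behavior on both the transformers version and the installed ExecuTorch version. A small sketch of that gating pattern, assuming executorch and optimum are importable; the helper name is ours, and the base_version comparison is a string comparison, matching the check in the diff:

```python
from packaging.version import parse

from executorch import version as executorch_version
from optimum.utils.import_utils import is_transformers_version


def can_register_sdpa_mask_without_vmap() -> bool:
    # Hypothetical helper: require transformers >= 4.53 and an ExecuTorch build
    # newer than 0.6.0, mirroring the combined guard in the diff above.
    return (
        is_transformers_version(">=", "4.53.0.dev0")
        and parse(executorch_version.__version__).base_version > "0.6.0"
    )


if can_register_sdpa_mask_without_vmap():
    print("ok to register sdpa_mask_without_vmap as the attention mask interface")
```
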
Expand Down Expand Up @@ -126,7 +130,7 @@ def export(
)
self._register_attention_mask_for_4_53(exportable_module)

if self.use_custom_kv_cache:
if self.use_custom_kv_cache and parse(executorch_version.__version__).base_version > "0.6.0":
from optimum.executorch.attentions.custom_kv_cache import (
replace_with_et_custom_kv_cache,
)
@@ -395,8 +399,8 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi
wrapped_decoder = (
Seq2SeqLMDecoderExportableModuleWithStaticCache(
model=self.full_model,
max_static_cache_length=self.generation_config.cache_config.max_cache_len,
batch_size=self.generation_config.cache_config.batch_size,
max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"),
batch_size=self.generation_config.cache_config.get("batch_size"),
)
.to("cpu")
.eval()
4 changes: 0 additions & 4 deletions optimum/exporters/executorch/quantization.py
@@ -26,10 +26,6 @@ def quantize_model_(
if not (qlinear_config or qembedding_config):
return

# TODO: Update torchao to use 0.11.0 once released
if parse(torchao.__version__) < parse("0.11.0.dev0"):
raise RuntimeError("Quantization requires torchao >= 0.11.0. Please upgrade torchao.")

from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import (
Int8DynamicActivationIntxWeightConfig,
4 changes: 2 additions & 2 deletions optimum/exporters/executorch/utils.py
@@ -53,8 +53,8 @@ def save_config_to_constant_methods(
# Check for cache_config and its attributes
cache_config = getattr(generation_config, "cache_config", None)
if cache_config is not None:
max_batch_size = getattr(cache_config, "batch_size", None)
max_seq_len = getattr(cache_config, "max_cache_len", None)
max_batch_size = cache_config.get("batch_size")
max_seq_len = cache_config.get("max_cache_len")

if max_batch_size is not None:
metadata["get_max_batch_size"] = max_batch_size
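
The getattr-to-get changes above (here and in custom_kv_cache.py and integrations.py) track cache_config becoming dict-like in newer transformers releases. A hedged compatibility sketch; the helper is ours, not part of either library:

```python
# Hypothetical helper: read a cache_config field whether it is dict-like
# (newer transformers) or an object exposing attributes (older releases).
def get_cache_config_value(cache_config, key, default=None):
    if cache_config is None:
        return default
    if hasattr(cache_config, "get"):
        return cache_config.get(key, default)
    return getattr(cache_config, key, default)


# Example with a plain dict, mirroring the keys used in the diff.
cache_config = {"batch_size": 1, "max_cache_len": 1024}
print(get_cache_config_value(cache_config, "batch_size"))   # 1
print(get_cache_config_value(cache_config, "device"))       # None
```
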
2 changes: 1 addition & 1 deletion setup.py
@@ -14,7 +14,7 @@
INSTALL_REQUIRE = [
"optimum~=1.24",
"executorch>=0.6.0",
"transformers==4.51.3",
"transformers==4.54.1",
]

TESTS_REQUIRE = [