Commit f4213d4
Added pytree registration inside dynamic cache class
1 parent 5c3b018 commit f4213d4

1 file changed: +36 −25 lines

src/transformers/cache_utils.py

Lines changed: 36 additions & 25 deletions
@@ -994,6 +994,8 @@ class DynamicCache(Cache):
         ```
         """
 
+    _export_registered = False
+
     def __init__(
         self,
         ddp_cache_data: Optional[Iterable[tuple[torch.Tensor, torch.Tensor]]] = None,
@@ -1047,6 +1049,34 @@ def __init__(
         else:
             super().__init__(layers=layers)
 
+        self._register_export_support()
+
+    @classmethod
+    def _register_export_support(cls):
+        """
+        Utilities for `DynamicCache` <> torch.export support
+        """
+        if cls._export_registered:
+            return
+
+        # Pytree registration causes a memory leak for FSDP runs, see here: https://github.com/huggingface/transformers/issues/39795
+        if is_torch_greater_or_equal("2.3") and not is_fsdp_enabled():
+            torch.utils._pytree.register_pytree_node(
+                DynamicCache,
+                lambda dynamic_cache: torch.utils._pytree._dict_flatten(cls._get_cache_dict(dynamic_cache)),
+                cls._unflatten_dynamic_cache,
+                serialized_type_name=f"{DynamicCache.__module__}.{DynamicCache.__name__}",
+                flatten_with_keys_fn=lambda dynamic_cache: torch.utils._pytree._dict_flatten_with_keys(
+                    cls._get_cache_dict(dynamic_cache)
+                ),
+            )
+            # TODO (tmanlaibaatar) This won't be needed in torch 2.7.
+            torch.fx._pytree.register_pytree_flatten_spec(
+                DynamicCache, lambda cache, spec: torch.fx._pytree._dict_flatten_spec(cls._get_cache_dict(cache), spec)
+            )
+
+        cls._export_registered = True
+
     def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]:
         """
         Converts the `Cache` instance into its equivalent in the legacy cache format. Used for
@@ -1070,12 +1100,9 @@ def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tens
         cache.update(key_states, value_states, layer_idx)
         return cache
 
-
-# Utilities for `DynamicCache` <> torch.export support
-# Pytree registration is not supported for FSDP runs, see here: https://github.com/huggingface/transformers/issues/39795
-if is_torch_greater_or_equal("2.3") and not is_fsdp_enabled():
-
-    def _get_cache_dict(cache: DynamicCache):
+    @staticmethod
+    def _get_cache_dict(cache):
+        """Convert cache to dictionary format for pytree operations."""
         if any(not isinstance(layer, (DynamicLayer, DynamicSlidingWindowLayer)) for layer in cache.layers):
             raise RuntimeError("This pytree flattening function should only be applied to DynamicCache")
 
@@ -1089,12 +1116,10 @@ def _get_cache_dict(cache: DynamicCache):
             "value_cache": [layer.values for layer in cache.layers if layer.values is not None],
         }
 
-    def _unflatten_dynamic_cache(
-        values,
-        context: torch.utils._pytree.Context,
-    ):
+    @classmethod
+    def _unflatten_dynamic_cache(cls, values, context: torch.utils._pytree.Context):
         dictionary = torch.utils._pytree._dict_unflatten(values, context)
-        cache = DynamicCache()
+        cache = cls()
         # Reconstruct layers from keys and values lists
         key_list = dictionary.get("key_cache", [])
         value_list = dictionary.get("value_cache", [])
@@ -1104,20 +1129,6 @@ def _unflatten_dynamic_cache(
             cache.update(key, value, idx)
         return cache
 
-    torch.utils._pytree.register_pytree_node(
-        DynamicCache,
-        lambda dynamic_cache: torch.utils._pytree._dict_flatten(_get_cache_dict(dynamic_cache)),
-        _unflatten_dynamic_cache,
-        serialized_type_name=f"{DynamicCache.__module__}.{DynamicCache.__name__}",
-        flatten_with_keys_fn=lambda dynamic_cache: torch.utils._pytree._dict_flatten_with_keys(
-            _get_cache_dict(dynamic_cache)
-        ),
-    )
-    # TODO (tmanlaibaatar) This won't be needed in torch 2.7.
-    torch.fx._pytree.register_pytree_flatten_spec(
-        DynamicCache, lambda cache, spec: torch.fx._pytree._dict_flatten_spec(_get_cache_dict(cache), spec)
-    )
-
 
 class OffloadedCache(Cache):
     """

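The structural change is that registration used to run as a module-level side effect at import time; it now runs on first `DynamicCache` construction, guarded by the `_export_registered` class flag so it happens at most once per process. A generic sketch of that run-once pattern (hypothetical `LazyRegistrar`, not transformers code):

class LazyRegistrar:
    _registered = False

    def __init__(self):
        self._register_once()

    @classmethod
    def _register_once(cls):
        # The class-level flag makes repeated construction a no-op; the
        # one-time side effect (e.g. a pytree registration) runs on the
        # first instantiation only.
        if cls._registered:
            return
        print("one-time setup")
        cls._registered = True


LazyRegistrar()  # prints "one-time setup"
LazyRegistrar()  # silent: the flag short-circuits re-registration

Presumably this also means `is_fsdp_enabled()` is evaluated when a cache is actually built, rather than at import time when the distributed setup may not be configured yet.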