Skip to content

Commit 0d556a7

Browse files
[Sequential Pipeline] only cache unique offloaded values (#2366)
Updated by @brian-dellabetta SUMMARY: The SequentialPipeline offloads subgraph outputs as part of normal usage. Occasionally these outputs share duplicates in kwargs that point to the same memory location on the onloaded device. When offloading is enabled, there was previously no check to see if any tensors to be offloaded had already previously been offloaded, which can cause a huge increase in memory requirements in some models, as reported in #2363. This PR - [x] adds an offload map to IntermediatesCache to ensure tensors are not redundantly offloaded - [x] wraps the map in an override to ensure `torch.equal` is used rather than `torch.eq` (which is the one used with `==` checks). `torch.eq` can return multiple boolean values depending on the tensors being compared, resulting in an error. This override, which should only be used when the tensors are immutable (the case here), allows us to retain the original hashing function and have an `O(1)` lookup. Our other attempts to circumvent the issue added to runtime or required `O(N)` lookup. Resolves #2363 TEST PLAN: - [x] Unit test added for `OverrideEqMode` - [x] Script from #2363 runs with ~81GB CPU RAM after first layer propagation, increased to ~88GB CPU RAM used by layer 11/49, and then stays consistently <89GB CPU RAM used by layer 25/49. On current main, this script would hit ~750GB CPU RAM usage during first layer propagation --------- Signed-off-by: Kyle Sayers <kylesayrs@gmail.com> Signed-off-by: Brian Dellabetta <bdellabe@redhat.com> Signed-off-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com> Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com> Co-authored-by: Brian Dellabetta <bdellabe@redhat.com>
1 parent 556b503 commit 0d556a7

File tree

2 files changed

+67
-3
lines changed

2 files changed

+67
-3
lines changed

src/llmcompressor/pipelines/cache.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,44 @@
55
from collections import defaultdict
66
from dataclasses import dataclass, fields, is_dataclass
77
from typing import Any, Generator
8+
from weakref import WeakKeyDictionary
89

910
import torch
11+
from torch.utils._python_dispatch import TorchDispatchMode
1012
from tqdm import tqdm
1113

1214

15+
class OverrideEqMode(TorchDispatchMode):
    """
    When using a torch.Tensor as a key in a dictionary, the equality
    check must return a single value instead of a torch.Tensor
    of bool values.

    Use this override context for such cases, to swap out the elementwise
    torch.eq equality check for an object-identity check. Note that two
    distinct tensors compare False under this mode even if their values
    are equal; only the same tensor object compares True.

    >>> a = torch.tensor([1, 2, 3])
    >>> b = a
    >>> c = torch.tensor([1, 2, 3])
    >>> a == b
    tensor([True, True, True])
    >>> with OverrideEqMode():
    ...     print(a == b)  # same object -> True
    ...     print(a == c)  # equal values but different objects -> False
    tensor(True)
    tensor(False)
    """

    def __torch_dispatch__(self, func, _types, args=(), kwargs=None):
        """
        Intercept aten ops: rewrite tensor equality to an identity check,
        pass every other op through unchanged.

        :param func: the aten operator being dispatched
        :param _types: tensor subclass types involved (unused)
        :param args: positional arguments to the operator
        :param kwargs: keyword arguments to the operator, may be None
        :return: result of the (possibly overridden) operator
        """
        kwargs = kwargs or {}

        # Check if the operation is elementwise tensor equality (`==`)
        if func is torch.ops.aten.eq.Tensor:
            # Override elementwise equality with an object-identity check,
            # which always yields a single boolean
            assert len(args) == 2, "Exactly 2 args must be provided"

            # NOTE: Errors out without cast to torch.tensor
            return torch.tensor(args[0] is args[1])

        # For all other operations, just run them normally
        return func(*args, **kwargs)
44+
45+
1346
@dataclass
1447
class IntermediateValue:
1548
"""
@@ -42,6 +75,10 @@ class IntermediatesCache:
4275
batch_intermediates: list[IntermediateValues]
4376
offload_device: torch.device | None
4477

78+
# map of onload value -> offload value
79+
# used to avoid excess memory usage when shared tensors are offloaded
80+
offload_values: WeakKeyDictionary[torch.Tensor, torch.Tensor] = WeakKeyDictionary()
81+
4582
def __init__(
4683
self,
4784
batch_intermediates: list[IntermediateValues] | None = None,
@@ -154,13 +191,16 @@ def size(self) -> dict[torch.device, int]:
154191
:return: dictionary mapping torch device to number of bytes in cache
155192
"""
156193
sizes = defaultdict(lambda: 0)
194+
memo = set()
157195

158196
def _size_helper(intermediate: IntermediateValue) -> int:
159197
value = intermediate.value
160198

161199
match value:
162200
case torch.Tensor():
163-
sizes[value.device] += value.nbytes
201+
if value not in memo:
202+
sizes[value.device] += value.nbytes
203+
memo.add(value)
164204
case list() | tuple():
165205
for v in value:
166206
_size_helper(v)
@@ -239,8 +279,17 @@ def _offload_value(
239279
kwargs = {"offload_device": offload_device, "onload_device": onload_device}
240280
match value:
241281
case torch.Tensor():
282+
with OverrideEqMode():
283+
# check for cache hit between shared tensors
284+
if value in cls.offload_values:
285+
offloaded = cls.offload_values[value]
286+
else:
287+
# move to offload if no hit
288+
offloaded = value.to(device=offload_device)
289+
cls.offload_values[value] = offloaded
290+
242291
return IntermediateValue(
243-
value=value.to(device=offload_device),
292+
value=offloaded,
244293
device=(onload_device if onload_device else value.device),
245294
)
246295
case list():

tests/llmcompressor/pipelines/test_cache.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import torch
55
from torch.utils.data import DataLoader, StackDataset
66

7-
from llmcompressor.pipelines.cache import IntermediatesCache
7+
from llmcompressor.pipelines.cache import IntermediatesCache, OverrideEqMode
88

99

1010
@dataclass
@@ -162,3 +162,18 @@ def deep_equal(a, b) -> bool:
162162
return deep_equal(a_dict, b_dict)
163163
case _:
164164
return a == b
165+
166+
167+
def test_override_eq_mode():
    """Tensor `==` is ambiguous under `assert` normally, but collapses to a
    single identity-based boolean inside OverrideEqMode."""
    first = torch.tensor([1, 2, 3])
    alias = first
    other = torch.tensor([2, 2, 2])

    # Outside the mode, `==` returns a multi-element bool tensor, which
    # cannot be coerced to a single truth value by `assert`/`not`
    with pytest.raises(RuntimeError):
        assert first == alias
    with pytest.raises(RuntimeError):
        assert not (first == other)

    # Inside the mode, `==` yields one boolean based on object identity
    with OverrideEqMode():
        assert first == alias
        assert not (first == other)

0 commit comments

Comments
 (0)