Skip to content

Commit 2e469aa

Browse files
kylesayrs and HDCharles authored
[Bugfix] Fix circular references when activation offload device is cuda (#2387)
## Background ## #2366 introduced a `WeakKeyDictionary` layer which caches shared tensors. This is a good approach, but has an edge case where, if the value of entry is identical to the key of the entry, then the key will never be garbage collected. This can occur if the user specifies `sequential_offload_device="cuda"`, or if the AWQ offload device is "cuda" (default true in most cases). ## Purpose ## * Fix memory leak in AWQ which led to very high CUDA memory usage ## Changes ## * Guard against entries into the `WeakKeyDictionary` where the key and value are identical * Misc * Move `OverrideEqMode` to the bottom of the `pipelines/cache.py` * Remove `_fp16_baseline_cache`, which was not being used ## Testing ## | Before Changes | After Changes | | - | - | | <img width="640" height="480" alt="awq_before" src="https://github.com/user-attachments/assets/07714321-4b2f-49b7-aa2b-5c745a60d2f4" /> | <img width="640" height="480" alt="awq_after" src="https://github.com/user-attachments/assets/336b0e98-c24c-4e0c-a873-3166effc32b7" /> | --------- Signed-off-by: Kyle Sayers <kylesayrs@gmail.com> Co-authored-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
1 parent 9979e98 commit 2e469aa

File tree

2 files changed

+33
-36
lines changed

2 files changed

+33
-36
lines changed

src/llmcompressor/modifiers/awq/base.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,6 @@ class AWQModifier(Modifier, QuantizationMixin):
172172
)
173173
# List to store error metrics for each layer
174174
_error_metrics: list[dict] = PrivateAttr(default_factory=list)
175-
# Cache FP16 baseline outputs for each parent module, one list of tensors per batch
176-
_fp16_baseline_cache: dict[Module, IntermediatesCache] = PrivateAttr(
177-
default_factory=dict
178-
)
179175

180176
def on_initialize(self, state: State, **kwargs) -> bool:
181177
"""

src/llmcompressor/pipelines/cache.py

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,37 +12,6 @@
1212
from tqdm import tqdm
1313

1414

15-
class OverrideEqMode(TorchDispatchMode):
16-
"""
17-
When using a torch.Tensor as a key in a dictionary, the equality
18-
check must return a single value instead of a torch.Tensor
19-
of bool values.
20-
Use this override context for such cases, to swap out the torch.eq
21-
equality check for a check on id
22-
>>> a = torch.tensor([1,2,3])
23-
>>> b = torch.tensor([1,2,3])
24-
>>> a == b
25-
tensor([True, True, True])
26-
>>> with OverrideEqMode():
27-
... a == b
28-
tensor(True)
29-
"""
30-
31-
def __torch_dispatch__(self, func, _types, args=(), kwargs=None):
32-
kwargs = kwargs or {}
33-
34-
# Check if the operation is equality
35-
if func is torch.ops.aten.eq.Tensor:
36-
# Override to use torch.equal
37-
assert len(args) == 2, "Exactly 2 args must be provided"
38-
39-
# NOTE: Errors out without cast to torch.tensor
40-
return torch.tensor(id(args[0]) == id(args[1]))
41-
42-
# For all other operations, just run them normally
43-
return func(*args, **kwargs)
44-
45-
4615
@dataclass
4716
class IntermediateValue:
4817
"""
@@ -289,7 +258,8 @@ def _offload_value(
289258
else:
290259
# move to offload if no hit
291260
offloaded = value.to(device=offload_device)
292-
cls.offload_values[value] = offloaded
261+
if offloaded is not value: # avoid circular ref
262+
cls.offload_values[value] = offloaded
293263

294264
return IntermediateValue(
295265
value=offloaded,
@@ -326,3 +296,34 @@ def _offload_value(
326296
):
327297
warnings.warn(f"Offloading not implemented for type {type(value)}.")
328298
return IntermediateValue(value=value, device=None)
299+
300+
301+
class OverrideEqMode(TorchDispatchMode):
302+
"""
303+
When using a torch.Tensor as a key in a dictionary, the equality
304+
check must return a single value instead of a torch.Tensor
305+
of bool values.
306+
Use this override context for such cases, to swap out the torch.eq
307+
equality check for a check on id
308+
>>> a = torch.tensor([1,2,3])
309+
>>> b = torch.tensor([1,2,3])
310+
>>> a == b
311+
tensor([True, True, True])
312+
>>> with OverrideEqMode():
313+
... a == b
314+
tensor(True)
315+
"""
316+
317+
def __torch_dispatch__(self, func, _types, args=(), kwargs=None):
318+
kwargs = kwargs or {}
319+
320+
# Check if the operation is equality
321+
if func is torch.ops.aten.eq.Tensor:
322+
# Override to use torch.equal
323+
assert len(args) == 2, "Exactly 2 args must be provided"
324+
325+
# NOTE: Errors out without cast to torch.tensor
326+
return torch.tensor(id(args[0]) == id(args[1]))
327+
328+
# For all other operations, just run them normally
329+
return func(*args, **kwargs)

0 commit comments

Comments (0)