Skip to content

Commit 22b62b9

Browse files
author
Avishek Goswami
committed
Address review: move deep_equal, make prefetch a global setting via State
Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent 9128c2b commit 22b62b9

File tree

4 files changed

+34
-36
lines changed

(see summary above: 4 files changed, +34 −36 lines)

src/llmcompressor/core/state.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class State:
114114
_last_log_step: float | int | None = None
115115
loss_masks: list[torch.Tensor] | None = None
116116
current_batch_idx: int = -1
117+
sequential_prefetch: bool = False
117118

118119
@property
119120
def compression_ready(self) -> bool:

src/llmcompressor/modifiers/awq/base.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,6 @@ class AWQModifier(Modifier, QuantizationMixin):
138138
requirements but requires more time to move data between cpu and execution
139139
device. Defaults to None, so cached args are not offloaded. Consider setting
140140
to torch.device("cpu") if you are encountering OOM errors
141-
:param prefetch: when offloading, prefetch the next batch in a background thread
142-
to overlap CPU-to-device onload with the forward pass, reducing wall-clock
143-
time. Default False; set True when offload_device is set and GPU memory
144-
allows two batches on device simultaneously
145141
:param duo_scaling: whether to use duo scaling, which uses both input activations
146142
and weights to determine the scaling factor. Defaults to True
147143
If True, both activations and weights are used.
@@ -161,7 +157,6 @@ class AWQModifier(Modifier, QuantizationMixin):
161157
sequential_targets: str | list[str] | None = None
162158
mappings: list[AWQMapping] | None = None
163159
offload_device: torch.device | None | Sentinel = Sentinel("not_provided")
164-
prefetch: bool = False
165160
duo_scaling: bool | Literal["both"] = True
166161
n_grid: int = 20
167162

@@ -613,8 +608,8 @@ def _smooth(
613608
@torch.no_grad()
614609
def _run_samples(self, module: Module) -> list[torch.Tensor]:
615610
cache = self._parent_args_cache[module]
616-
# Prefetch overlaps CPU->device onload with forward pass when offloading.
617-
batch_iter = cache.iter_prefetch() if self.prefetch else cache
611+
use_prefetch = active_session().state.sequential_prefetch
612+
batch_iter = cache.iter_prefetch() if use_prefetch else cache
618613
outputs = [module(**batch_kwargs) for batch_kwargs in batch_iter]
619614
return [
620615
# If tuple, assume that first argument is the input

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def _get_batches(
3434
num_batches: int,
3535
input_names: list[str],
3636
desc: str,
37-
use_prefetch: bool = False,
37+
sequential_prefetch: bool = False,
3838
) -> Iterator[tuple[int, dict]]:
3939
"""
4040
Yield (batch_idx, inputs) with the next batch optionally prefetched in a
@@ -44,7 +44,7 @@ def _get_batches(
4444
"""
4545
batch_source = (
4646
activations.iter_prefetch(input_names)
47-
if use_prefetch
47+
if sequential_prefetch
4848
else activations.iter(input_names)
4949
)
5050
for batch_idx, inputs in tqdm(
@@ -131,22 +131,24 @@ def __call__(
131131
else:
132132
session.state.loss_masks = None
133133

134+
sequential_prefetch = getattr(dataset_args, "sequential_prefetch", False)
135+
session.state.sequential_prefetch = sequential_prefetch
136+
134137
for subgraph_index, subgraph in enumerate(subgraphs):
135138
# prepare tqdm description texts
136139
calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating"
137140
prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating"
138141

139142
# reduce memory movement by keeping modules onloaded
140143
num_batches = len(dataloader)
141-
use_prefetch = getattr(dataset_args, "sequential_prefetch", False)
142144
with disable_offloading():
143145
# do a preliminary pass to trigger modifier hooks
144146
for batch_idx, inputs in _get_batches(
145147
activations,
146148
num_batches,
147149
subgraph.input_names,
148150
calib_desc,
149-
use_prefetch,
151+
sequential_prefetch,
150152
):
151153
session.state.current_batch_idx = batch_idx
152154
subgraph.forward(model, **inputs)
@@ -161,7 +163,7 @@ def __call__(
161163
num_batches,
162164
subgraph.input_names,
163165
prop_desc,
164-
use_prefetch,
166+
sequential_prefetch,
165167
):
166168
output = subgraph.forward(model, **inputs)
167169
if subgraph_index < num_subgraphs - 1:

tests/llmcompressor/pipelines/test_cache.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -76,30 +76,6 @@ def batch_dicts_equal(a: dict, b: dict) -> bool:
7676
assert batch_dicts_equal(b_iter, b_prefetch), f"batch {i} differs"
7777

7878

79-
def deep_equal(a, b) -> bool:
80-
if type(a) is not type(b):
81-
return False
82-
83-
match a:
84-
case torch.Tensor():
85-
return torch.equal(a, b)
86-
case list() | tuple():
87-
if len(a) != len(b):
88-
return False
89-
return all(deep_equal(_a, _b) for _a, _b in zip(a, b))
90-
case dict():
91-
if a.keys() != b.keys():
92-
return False
93-
return all(deep_equal(a[key], b[key]) for key in a.keys())
94-
case _ if is_dataclass(a):
95-
a_dict = {field.name: getattr(a, field.name) for field in fields(a)}
96-
b_dict = {field.name: getattr(b, field.name) for field in fields(b)}
97-
98-
return deep_equal(a_dict, b_dict)
99-
case _:
100-
return a == b
101-
102-
10379
@pytest.mark.unit
10480
def test_fetch_inputs(sample_cache):
10581
fetched = sample_cache.fetch(0, ["input_ids", "attention_mask"])
@@ -187,6 +163,30 @@ def test_device_handling(sample_dataloader):
187163
assert fetched["hidden_states"].device.type == "cuda"
188164

189165

166+
def deep_equal(a, b) -> bool:
167+
if type(a) is not type(b):
168+
return False
169+
170+
match a:
171+
case torch.Tensor():
172+
return torch.equal(a, b)
173+
case list() | tuple():
174+
if len(a) != len(b):
175+
return False
176+
return all(deep_equal(_a, _b) for _a, _b in zip(a, b))
177+
case dict():
178+
if a.keys() != b.keys():
179+
return False
180+
return all(deep_equal(a[key], b[key]) for key in a.keys())
181+
case _ if is_dataclass(a):
182+
a_dict = {field.name: getattr(a, field.name) for field in fields(a)}
183+
b_dict = {field.name: getattr(b, field.name) for field in fields(b)}
184+
185+
return deep_equal(a_dict, b_dict)
186+
case _:
187+
return a == b
188+
189+
190190
def test_override_eq_mode():
191191
a = torch.tensor([1, 2, 3])
192192
b = a

0 commit comments

Comments (0)