Skip to content

Commit b6a3282

Browse files
author
Avishek Goswami
committed
Address review: move deep_equal, make prefetch a global setting via State
Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent 9128c2b commit b6a3282

File tree

5 files changed

+78
-40
lines changed

5 files changed

+78
-40
lines changed

src/llmcompressor/core/state.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class State:
114114
_last_log_step: float | int | None = None
115115
loss_masks: list[torch.Tensor] | None = None
116116
current_batch_idx: int = -1
117+
sequential_prefetch: bool = False
117118

118119
@property
119120
def compression_ready(self) -> bool:

src/llmcompressor/modifiers/awq/base.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,6 @@ class AWQModifier(Modifier, QuantizationMixin):
138138
requirements but requires more time to move data between cpu and execution
139139
device. Defaults to None, so cached args are not offloaded. Consider setting
140140
to torch.device("cpu") if you are encountering OOM errors
141-
:param prefetch: when offloading, prefetch the next batch in a background thread
142-
to overlap CPU-to-device onload with the forward pass, reducing wall-clock
143-
time. Default False; set True when offload_device is set and GPU memory
144-
allows two batches on device simultaneously
145141
:param duo_scaling: whether to use duo scaling, which uses both input activations
146142
and weights to determine the scaling factor. Defaults to True
147143
If True, both activations and weights are used.
@@ -161,7 +157,6 @@ class AWQModifier(Modifier, QuantizationMixin):
161157
sequential_targets: str | list[str] | None = None
162158
mappings: list[AWQMapping] | None = None
163159
offload_device: torch.device | None | Sentinel = Sentinel("not_provided")
164-
prefetch: bool = False
165160
duo_scaling: bool | Literal["both"] = True
166161
n_grid: int = 20
167162

@@ -613,8 +608,8 @@ def _smooth(
613608
@torch.no_grad()
614609
def _run_samples(self, module: Module) -> list[torch.Tensor]:
615610
cache = self._parent_args_cache[module]
616-
# Prefetch overlaps CPU->device onload with forward pass when offloading.
617-
batch_iter = cache.iter_prefetch() if self.prefetch else cache
611+
use_prefetch = active_session().state.sequential_prefetch
612+
batch_iter = cache.iter_prefetch() if use_prefetch else cache
618613
outputs = [module(**batch_kwargs) for batch_kwargs in batch_iter]
619614
return [
620615
# If tuple, assume that first argument is the input

src/llmcompressor/pipelines/cache.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -205,23 +205,49 @@ def iter_prefetch(
205205
Overlaps onload from offload_device with consumption of the current batch,
206206
which can reduce wall-clock time when offloading to CPU.
207207
208+
When CUDA is available, uses non_blocking transfers (requires pinned CPU
209+
tensors, set up by _offload_value) and synchronises via CUDA events so the
210+
main stream waits for each H2D copy before running GPU kernels on the data.
211+
208212
Yields the same fetched batch dicts as :meth:`iter`; only the timing
209213
of onloads differs.
210214
"""
211215
num_batches = len(self.batch_intermediates)
212216
if num_batches == 0:
213217
return
218+
219+
# Create a dedicated CUDA stream for H2D transfers so they run on a
220+
# separate stream from the main thread's compute stream. Without this,
221+
# both threads default to the null stream (stream 0) which serializes
222+
# all operations and prevents any overlap.
223+
h2d_stream = torch.cuda.Stream() if torch.cuda.is_available() else None
224+
225+
def _fetch_and_record(batch_index):
226+
event = None
227+
if h2d_stream is not None:
228+
with torch.cuda.stream(h2d_stream):
229+
data = self.fetch(batch_index, input_names)
230+
event = torch.cuda.Event()
231+
event.record(h2d_stream)
232+
else:
233+
data = self.fetch(batch_index, input_names)
234+
return data, event
235+
214236
with ThreadPoolExecutor(max_workers=1) as executor:
215237
future = None
216238
for batch_index in range(num_batches):
217239
if future is not None:
218-
current = future.result()
240+
current, event = future.result()
219241
else:
220-
current = self.fetch(batch_index, input_names)
242+
current, event = _fetch_and_record(batch_index)
221243
if batch_index + 1 < num_batches:
222-
future = executor.submit(self.fetch, batch_index + 1, input_names)
244+
future = executor.submit(_fetch_and_record, batch_index + 1)
223245
else:
224246
future = None
247+
# Make the main CUDA stream wait for the background H2D copy
248+
# before any GPU kernel consumes the prefetched tensors
249+
if event is not None:
250+
torch.cuda.current_stream().wait_event(event)
225251
yield current
226252

227253
def __iter__(self) -> Generator[Any, None, None]:
@@ -243,7 +269,14 @@ def _onload_value(cls, intermediate: IntermediateValue) -> Any:
243269

244270
match value:
245271
case torch.Tensor():
246-
return value.to(device=device)
272+
# use non_blocking when source is pinned and target is CUDA so the
273+
# H2D DMA can overlap with GPU compute on a separate CUDA stream
274+
non_blocking = (
275+
value.is_pinned()
276+
and device is not None
277+
and torch.device(device).type == "cuda"
278+
)
279+
return value.to(device=device, non_blocking=non_blocking)
247280
case list():
248281
return [cls._onload_value(v) for v in value]
249282
case tuple():
@@ -287,6 +320,13 @@ def _offload_value(
287320
# move to offload if no hit
288321
offloaded = value.to(device=offload_device)
289322
if offloaded is not value: # avoid circular ref
323+
# pin CPU tensors so onload can use non_blocking DMA
324+
if (
325+
torch.device(offload_device).type == "cpu"
326+
and torch.cuda.is_available()
327+
and not offloaded.is_pinned()
328+
):
329+
offloaded = offloaded.pin_memory()
290330
cls.offload_values[value] = offloaded
291331

292332
return IntermediateValue(

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def _get_batches(
3434
num_batches: int,
3535
input_names: list[str],
3636
desc: str,
37-
use_prefetch: bool = False,
37+
sequential_prefetch: bool = False,
3838
) -> Iterator[tuple[int, dict]]:
3939
"""
4040
Yield (batch_idx, inputs) with the next batch optionally prefetched in a
@@ -44,7 +44,7 @@ def _get_batches(
4444
"""
4545
batch_source = (
4646
activations.iter_prefetch(input_names)
47-
if use_prefetch
47+
if sequential_prefetch
4848
else activations.iter(input_names)
4949
)
5050
for batch_idx, inputs in tqdm(
@@ -131,22 +131,24 @@ def __call__(
131131
else:
132132
session.state.loss_masks = None
133133

134+
sequential_prefetch = getattr(dataset_args, "sequential_prefetch", False)
135+
session.state.sequential_prefetch = sequential_prefetch
136+
134137
for subgraph_index, subgraph in enumerate(subgraphs):
135138
# prepare tqdm description texts
136139
calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating"
137140
prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating"
138141

139142
# reduce memory movement by keeping modules onloaded
140143
num_batches = len(dataloader)
141-
use_prefetch = getattr(dataset_args, "sequential_prefetch", False)
142144
with disable_offloading():
143145
# do a preliminary pass to trigger modifier hooks
144146
for batch_idx, inputs in _get_batches(
145147
activations,
146148
num_batches,
147149
subgraph.input_names,
148150
calib_desc,
149-
use_prefetch,
151+
sequential_prefetch,
150152
):
151153
session.state.current_batch_idx = batch_idx
152154
subgraph.forward(model, **inputs)
@@ -161,7 +163,7 @@ def __call__(
161163
num_batches,
162164
subgraph.input_names,
163165
prop_desc,
164-
use_prefetch,
166+
sequential_prefetch,
165167
):
166168
output = subgraph.forward(model, **inputs)
167169
if subgraph_index < num_subgraphs - 1:

tests/llmcompressor/pipelines/test_cache.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -76,30 +76,6 @@ def batch_dicts_equal(a: dict, b: dict) -> bool:
7676
assert batch_dicts_equal(b_iter, b_prefetch), f"batch {i} differs"
7777

7878

79-
def deep_equal(a, b) -> bool:
80-
if type(a) is not type(b):
81-
return False
82-
83-
match a:
84-
case torch.Tensor():
85-
return torch.equal(a, b)
86-
case list() | tuple():
87-
if len(a) != len(b):
88-
return False
89-
return all(deep_equal(_a, _b) for _a, _b in zip(a, b))
90-
case dict():
91-
if a.keys() != b.keys():
92-
return False
93-
return all(deep_equal(a[key], b[key]) for key in a.keys())
94-
case _ if is_dataclass(a):
95-
a_dict = {field.name: getattr(a, field.name) for field in fields(a)}
96-
b_dict = {field.name: getattr(b, field.name) for field in fields(b)}
97-
98-
return deep_equal(a_dict, b_dict)
99-
case _:
100-
return a == b
101-
102-
10379
@pytest.mark.unit
10480
def test_fetch_inputs(sample_cache):
10581
fetched = sample_cache.fetch(0, ["input_ids", "attention_mask"])
@@ -187,6 +163,30 @@ def test_device_handling(sample_dataloader):
187163
assert fetched["hidden_states"].device.type == "cuda"
188164

189165

166+
def deep_equal(a, b) -> bool:
167+
if type(a) is not type(b):
168+
return False
169+
170+
match a:
171+
case torch.Tensor():
172+
return torch.equal(a, b)
173+
case list() | tuple():
174+
if len(a) != len(b):
175+
return False
176+
return all(deep_equal(_a, _b) for _a, _b in zip(a, b))
177+
case dict():
178+
if a.keys() != b.keys():
179+
return False
180+
return all(deep_equal(a[key], b[key]) for key in a.keys())
181+
case _ if is_dataclass(a):
182+
a_dict = {field.name: getattr(a, field.name) for field in fields(a)}
183+
b_dict = {field.name: getattr(b, field.name) for field in fields(b)}
184+
185+
return deep_equal(a_dict, b_dict)
186+
case _:
187+
return a == b
188+
189+
190190
def test_override_eq_mode():
191191
a = torch.tensor([1, 2, 3])
192192
b = a

0 commit comments

Comments (0)