Skip to content

Commit 353f556

Browse files
GOavi101 (Avishek Goswami) and HDCharles
authored
Feature/intermediates cache prefetch (#2392)
Optional prefetch was added to the intermediates cache and wired into AWQ when offloading. **IntermediatesCache** A new method, iter_prefetch(), iterates over batches like iter() but prefetches the next batch in a background thread so onload from the offload device overlaps with use of the current batch, reducing wall-clock time when offloading to CPU. **AWQ** When offload_device is set, _run_samples() uses cache.iter_prefetch() instead of the plain cache iterator, so CPU→device onload overlaps with the forward pass over cached parent args during smoothing. **Tests** Two tests were added: one verifying that iter_prefetch() yields the same batches as iter(), and one verifying that iter_prefetch() on an empty cache yields nothing. No new public API; prefetch is used automatically when AWQ offloads. Fixes: #2374 --------- Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com> Co-authored-by: Avishek Goswami <avishek.goswami@ibm.com> Co-authored-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
1 parent 822668a commit 353f556

File tree

5 files changed

+114
-27
lines changed

5 files changed

+114
-27
lines changed

src/llmcompressor/core/state.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ class State:
104104
hardware: Hardware = field(default_factory=Hardware)
105105
loss_masks: list[torch.Tensor] | None = None
106106
current_batch_idx: int = -1
107+
sequential_prefetch: bool = False
107108

108109
@property
109110
def compression_ready(self) -> bool:

src/llmcompressor/modifiers/awq/base.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -607,9 +607,10 @@ def _smooth(
607607

608608
@torch.no_grad()
609609
def _run_samples(self, module: Module) -> list[torch.Tensor]:
610-
outputs = [
611-
module(**batch_kwargs) for batch_kwargs in self._parent_args_cache[module]
612-
]
610+
cache = self._parent_args_cache[module]
611+
use_prefetch = active_session().state.sequential_prefetch
612+
batch_iter = cache.iter_prefetch() if use_prefetch else cache
613+
outputs = [module(**batch_kwargs) for batch_kwargs in batch_iter]
613614
return [
614615
# If tuple, assume that first argument is the input
615616
output[0] if isinstance(output, tuple) else output

src/llmcompressor/pipelines/cache.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44
import warnings
55
from collections import defaultdict
6+
from concurrent.futures import ThreadPoolExecutor
67
from dataclasses import dataclass, fields, is_dataclass
78
from typing import Any, Generator
89
from weakref import WeakKeyDictionary
@@ -196,6 +197,59 @@ def iter(self, input_names: list[str] | None = None) -> Generator[Any, None, Non
196197
for batch_index in range(len(self.batch_intermediates)):
197198
yield self.fetch(batch_index, input_names)
198199

200+
def iter_prefetch(
201+
self, input_names: list[str] | None = None
202+
) -> Generator[Any, None, None]:
203+
"""
204+
Iterate over batches with the next batch prefetched in a background thread.
205+
Overlaps onload from offload_device with consumption of the current batch,
206+
which can reduce wall-clock time when offloading to CPU.
207+
208+
When CUDA is available, uses non_blocking transfers (requires pinned CPU
209+
tensors, set up by _offload_value) and synchronises via CUDA events so the
210+
main stream waits for each H2D copy before running GPU kernels on the data.
211+
212+
Yields the same fetched batch dicts as :meth:`iter`; only the timing
213+
of onloads differs.
214+
"""
215+
num_batches = len(self.batch_intermediates)
216+
if num_batches == 0:
217+
return
218+
219+
# Create a dedicated CUDA stream for H2D transfers so they run on a
220+
# separate stream from the main thread's compute stream. Without this,
221+
# both threads default to the null stream (stream 0) which serializes
222+
# all operations and prevents any overlap.
223+
h2d_stream = torch.cuda.Stream() if torch.cuda.is_available() else None
224+
225+
def _fetch_and_record(batch_index):
226+
event = None
227+
if h2d_stream is not None:
228+
with torch.cuda.stream(h2d_stream):
229+
data = self.fetch(batch_index, input_names)
230+
event = torch.cuda.Event()
231+
event.record(h2d_stream)
232+
else:
233+
data = self.fetch(batch_index, input_names)
234+
return data, event
235+
236+
with ThreadPoolExecutor(max_workers=1) as executor:
237+
future = None
238+
for batch_index in range(num_batches):
239+
if future is not None:
240+
current, event = future.result()
241+
else:
242+
current, event = _fetch_and_record(batch_index)
243+
if batch_index + 1 < num_batches:
244+
future = executor.submit(_fetch_and_record, batch_index + 1)
245+
else:
246+
future = None
247+
# Make the main CUDA stream wait for the background H2D copy
248+
# before any GPU kernel consumes the prefetched tensors
249+
if event is not None:
250+
torch.cuda.current_stream().wait_event(event)
251+
yield current
252+
199253
def __iter__(self) -> Generator[Any, None, None]:
200254
yield from self.iter()
201255

@@ -215,7 +269,14 @@ def _onload_value(cls, intermediate: IntermediateValue) -> Any:
215269

216270
match value:
217271
case torch.Tensor():
218-
return value.to(device=device)
272+
# use non_blocking when source is pinned and target is CUDA so the
273+
# H2D DMA can overlap with GPU compute on a separate CUDA stream
274+
non_blocking = (
275+
value.is_pinned()
276+
and device is not None
277+
and torch.device(device).type == "cuda"
278+
)
279+
return value.to(device=device, non_blocking=non_blocking)
219280
case list():
220281
return [cls._onload_value(v) for v in value]
221282
case tuple():
@@ -259,6 +320,13 @@ def _offload_value(
259320
# move to offload if no hit
260321
offloaded = value.to(device=offload_device)
261322
if offloaded is not value: # avoid circular ref
323+
# pin CPU tensors so onload can use non_blocking DMA
324+
if (
325+
torch.device(offload_device).type == "cpu"
326+
and torch.cuda.is_available()
327+
and not offloaded.is_pinned()
328+
):
329+
offloaded = offloaded.pin_memory()
262330
cls.offload_values[value] = offloaded
263331

264332
return IntermediateValue(

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import contextlib
2-
from concurrent.futures import ThreadPoolExecutor
32
from typing import TYPE_CHECKING, Iterator
43

54
import torch
@@ -35,30 +34,23 @@ def _get_batches(
3534
num_batches: int,
3635
input_names: list[str],
3736
desc: str,
38-
use_prefetch: bool = False,
37+
sequential_prefetch: bool = False,
3938
) -> Iterator[tuple[int, dict]]:
4039
"""
4140
Yield (batch_idx, inputs) with the next batch optionally prefetched in a
4241
background thread to overlap fetch (onload from offload device) with the
43-
main-thread forward pass.
42+
main-thread forward pass. Delegates to
43+
:meth:`IntermediatesCache.iter_prefetch` when prefetching is enabled.
4444
"""
45-
if not use_prefetch:
46-
for batch_idx in tqdm(range(num_batches), desc=desc):
47-
inputs = activations.fetch(batch_idx, input_names)
48-
yield batch_idx, inputs
49-
return
50-
with ThreadPoolExecutor(max_workers=1) as executor:
51-
future = None
52-
for batch_idx in tqdm(range(num_batches), desc=desc):
53-
if future is not None:
54-
inputs = future.result()
55-
else:
56-
inputs = activations.fetch(batch_idx, input_names)
57-
if batch_idx + 1 < num_batches:
58-
future = executor.submit(activations.fetch, batch_idx + 1, input_names)
59-
else:
60-
future = None
61-
yield batch_idx, inputs
45+
batch_source = (
46+
activations.iter_prefetch(input_names)
47+
if sequential_prefetch
48+
else activations.iter(input_names)
49+
)
50+
for batch_idx, inputs in tqdm(
51+
enumerate(batch_source), total=num_batches, desc=desc
52+
):
53+
yield batch_idx, inputs
6254

6355

6456
@CalibrationPipeline.register("sequential")
@@ -139,22 +131,24 @@ def __call__(
139131
else:
140132
session.state.loss_masks = None
141133

134+
sequential_prefetch = getattr(dataset_args, "sequential_prefetch", False)
135+
session.state.sequential_prefetch = sequential_prefetch
136+
142137
for subgraph_index, subgraph in enumerate(subgraphs):
143138
# prepare tqdm description texts
144139
calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating"
145140
prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating"
146141

147142
# reduce memory movement by keeping modules onloaded
148143
num_batches = len(dataloader)
149-
use_prefetch = getattr(dataset_args, "sequential_prefetch", False)
150144
with disable_offloading():
151145
# do a preliminary pass to trigger modifier hooks
152146
for batch_idx, inputs in _get_batches(
153147
activations,
154148
num_batches,
155149
subgraph.input_names,
156150
calib_desc,
157-
use_prefetch,
151+
sequential_prefetch,
158152
):
159153
session.state.current_batch_idx = batch_idx
160154
subgraph.forward(model, **inputs)
@@ -169,7 +163,7 @@ def __call__(
169163
num_batches,
170164
subgraph.input_names,
171165
prop_desc,
172-
use_prefetch,
166+
sequential_prefetch,
173167
):
174168
output = subgraph.forward(model, **inputs)
175169
if subgraph_index < num_subgraphs - 1:

tests/llmcompressor/pipelines/test_cache.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,29 @@ def test_initialization(sample_dataloader):
5353
assert isinstance(cache.batch_intermediates[0], dict)
5454

5555

56+
@pytest.mark.unit
57+
def test_iter_prefetch_empty_cache():
58+
"""iter_prefetch yields nothing when cache has no batches."""
59+
cache = IntermediatesCache.empty(0, torch.device("cpu"))
60+
assert list(cache.iter_prefetch()) == []
61+
62+
63+
@pytest.mark.unit
64+
def test_iter_prefetch_matches_iter(sample_cache):
65+
"""iter_prefetch yields the same batch contents as iter."""
66+
67+
def batch_dicts_equal(a: dict, b: dict) -> bool:
68+
if set(a.keys()) != set(b.keys()):
69+
return False
70+
return all(deep_equal(a[k], b[k]) for k in a)
71+
72+
via_iter = list(sample_cache.iter())
73+
via_prefetch = list(sample_cache.iter_prefetch())
74+
assert len(via_iter) == len(via_prefetch)
75+
for i, (b_iter, b_prefetch) in enumerate(zip(via_iter, via_prefetch)):
76+
assert batch_dicts_equal(b_iter, b_prefetch), f"batch {i} differs"
77+
78+
5679
@pytest.mark.unit
5780
def test_fetch_inputs(sample_cache):
5881
fetched = sample_cache.fetch(0, ["input_ids", "attention_mask"])

0 commit comments

Comments (0)