Skip to content

Commit 9128c2b

Browse files
author
Avishek Goswami
committed
Add optional prefetch to intermediates cache; enable for AWQ when offloading
- IntermediatesCache.iter_prefetch() overlaps onload of next batch with consumption of current batch via a background thread
- AWQ _run_samples uses iter_prefetch when offload_device is set to overlap CPU->device transfer with module forward passes
- Add test_iter_prefetch_matches_iter to verify prefetch yields same results as iter

Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent a33d4ff commit 9128c2b

File tree

4 files changed

+30
-30
lines changed

4 files changed

+30
-30
lines changed

src/llmcompressor/modifiers/awq/base.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ class AWQModifier(Modifier, QuantizationMixin):
138138
requirements but requires more time to move data between cpu and execution
139139
device. Defaults to None, so cached args are not offloaded. Consider setting
140140
to torch.device("cpu") if you are encountering OOM errors
141+
:param prefetch: when offloading, prefetch the next batch in a background thread
142+
to overlap CPU-to-device onload with the forward pass, reducing wall-clock
143+
time. Default False; set True when offload_device is set and GPU memory
144+
allows two batches on device simultaneously
141145
:param duo_scaling: whether to use duo scaling, which uses both input activations
142146
and weights to determine the scaling factor. Defaults to True
143147
If True, both activations and weights are used.
@@ -157,6 +161,7 @@ class AWQModifier(Modifier, QuantizationMixin):
157161
sequential_targets: str | list[str] | None = None
158162
mappings: list[AWQMapping] | None = None
159163
offload_device: torch.device | None | Sentinel = Sentinel("not_provided")
164+
prefetch: bool = False
160165
duo_scaling: bool | Literal["both"] = True
161166
n_grid: int = 20
162167

@@ -608,11 +613,8 @@ def _smooth(
608613
@torch.no_grad()
609614
def _run_samples(self, module: Module) -> list[torch.Tensor]:
610615
cache = self._parent_args_cache[module]
611-
batch_iter = (
612-
cache.iter_prefetch()
613-
if self.offload_device is not None
614-
else cache
615-
)
616+
# Prefetch overlaps CPU->device onload with forward pass when offloading.
617+
batch_iter = cache.iter_prefetch() if self.prefetch else cache
616618
outputs = [module(**batch_kwargs) for batch_kwargs in batch_iter]
617619
return [
618620
# If tuple, assume that first argument is the input

src/llmcompressor/pipelines/cache.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def iter_prefetch(
203203
"""
204204
Iterate over batches with the next batch prefetched in a background thread.
205205
Overlaps onload from offload_device with consumption of the current batch,
206-
which can reduce latency when offloading to CPU.
206+
which can reduce wall-clock time when offloading to CPU.
207207
208208
Yields the same fetched batch dicts as :meth:`iter`; only the timing
209209
of onloads differs.
@@ -215,15 +215,14 @@ def iter_prefetch(
215215
future = None
216216
for batch_index in range(num_batches):
217217
if future is not None:
218-
yield future.result()
218+
current = future.result()
219219
else:
220-
yield self.fetch(batch_index, input_names)
220+
current = self.fetch(batch_index, input_names)
221221
if batch_index + 1 < num_batches:
222-
future = executor.submit(
223-
self.fetch, batch_index + 1, input_names
224-
)
222+
future = executor.submit(self.fetch, batch_index + 1, input_names)
225223
else:
226224
future = None
225+
yield current
227226

228227
def __iter__(self) -> Generator[Any, None, None]:
229228
yield from self.iter()

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import contextlib
2-
from concurrent.futures import ThreadPoolExecutor
32
from typing import TYPE_CHECKING, Iterator
43

54
import torch
@@ -40,25 +39,18 @@ def _get_batches(
4039
"""
4140
Yield (batch_idx, inputs) with the next batch optionally prefetched in a
4241
background thread to overlap fetch (onload from offload device) with the
43-
main-thread forward pass.
42+
main-thread forward pass. Delegates to
43+
:meth:`IntermediatesCache.iter_prefetch` when prefetching is enabled.
4444
"""
45-
if not use_prefetch:
46-
for batch_idx in tqdm(range(num_batches), desc=desc):
47-
inputs = activations.fetch(batch_idx, input_names)
48-
yield batch_idx, inputs
49-
return
50-
with ThreadPoolExecutor(max_workers=1) as executor:
51-
future = None
52-
for batch_idx in tqdm(range(num_batches), desc=desc):
53-
if future is not None:
54-
inputs = future.result()
55-
else:
56-
inputs = activations.fetch(batch_idx, input_names)
57-
if batch_idx + 1 < num_batches:
58-
future = executor.submit(activations.fetch, batch_idx + 1, input_names)
59-
else:
60-
future = None
61-
yield batch_idx, inputs
45+
batch_source = (
46+
activations.iter_prefetch(input_names)
47+
if use_prefetch
48+
else activations.iter(input_names)
49+
)
50+
for batch_idx, inputs in tqdm(
51+
enumerate(batch_source), total=num_batches, desc=desc
52+
):
53+
yield batch_idx, inputs
6254

6355

6456
@CalibrationPipeline.register("sequential")

tests/llmcompressor/pipelines/test_cache.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ def test_initialization(sample_dataloader):
5353
assert isinstance(cache.batch_intermediates[0], dict)
5454

5555

56+
@pytest.mark.unit
57+
def test_iter_prefetch_empty_cache():
58+
"""iter_prefetch yields nothing when cache has no batches."""
59+
cache = IntermediatesCache.empty(0, torch.device("cpu"))
60+
assert list(cache.iter_prefetch()) == []
61+
62+
5663
@pytest.mark.unit
5764
def test_iter_prefetch_matches_iter(sample_cache):
5865
"""iter_prefetch yields the same batch contents as iter."""

0 commit comments

Comments
 (0)