Skip to content

Commit 457af3e

Browse files
author
Avishek Goswami
committed
Add optional prefetch to intermediates cache; enable for AWQ when offloading
- `IntermediatesCache.iter_prefetch()` overlaps onload of the next batch with consumption of the current batch via a background thread.
- AWQ `_run_samples` uses `iter_prefetch` when `offload_device` is set, overlapping the CPU->device transfer with module forward passes.
- Add `test_iter_prefetch_matches_iter` to verify that prefetch yields the same results as `iter`.

Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent a33d4ff commit 457af3e

File tree

3 files changed

+12
-3
lines changed

3 files changed

+12
-3
lines changed

src/llmcompressor/modifiers/awq/base.py

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -608,6 +608,7 @@ def _smooth(
608608
@torch.no_grad()
609609
def _run_samples(self, module: Module) -> list[torch.Tensor]:
610610
cache = self._parent_args_cache[module]
611+
# When offloading, prefetch overlaps CPU->device onload with forward pass.
611612
batch_iter = (
612613
cache.iter_prefetch()
613614
if self.offload_device is not None

src/llmcompressor/pipelines/cache.py

Lines changed: 4 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -203,7 +203,7 @@ def iter_prefetch(
203203
"""
204204
Iterate over batches with the next batch prefetched in a background thread.
205205
Overlaps onload from offload_device with consumption of the current batch,
206-
which can reduce latency when offloading to CPU.
206+
which can reduce wall-clock time when offloading to CPU.
207207
208208
Yields the same fetched batch dicts as :meth:`iter`; only the timing
209209
of onloads differs.
@@ -215,15 +215,16 @@ def iter_prefetch(
215215
future = None
216216
for batch_index in range(num_batches):
217217
if future is not None:
218-
yield future.result()
218+
current = future.result()
219219
else:
220-
yield self.fetch(batch_index, input_names)
220+
current = self.fetch(batch_index, input_names)
221221
if batch_index + 1 < num_batches:
222222
future = executor.submit(
223223
self.fetch, batch_index + 1, input_names
224224
)
225225
else:
226226
future = None
227+
yield current
227228

228229
def __iter__(self) -> Generator[Any, None, None]:
229230
yield from self.iter()

tests/llmcompressor/pipelines/test_cache.py

Lines changed: 7 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -53,6 +53,13 @@ def test_initialization(sample_dataloader):
5353
assert isinstance(cache.batch_intermediates[0], dict)
5454

5555

56+
@pytest.mark.unit
57+
def test_iter_prefetch_empty_cache():
58+
"""iter_prefetch yields nothing when cache has no batches."""
59+
cache = IntermediatesCache.empty(0, torch.device("cpu"))
60+
assert list(cache.iter_prefetch()) == []
61+
62+
5663
@pytest.mark.unit
5764
def test_iter_prefetch_matches_iter(sample_cache):
5865
"""iter_prefetch yields the same batch contents as iter."""

0 commit comments

Comments (0)