5 files changed: +10 −39 lines

File 1 of 5:

@@ -128,10 +128,6 @@ def __init__(
         # offload to cpu if possible
         if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available():
-            # TODO: consider renaming function similar to "offload_dispatch_model"
-            # TODO: modify function to remove any hooks if they already exist (making
-            # sure to move to cpu when removing hook
-            # TODO: remove hook in util
             remove_hook_from_module(model_args.model, recurse=True)
             force_cpu_offload(model_args.model, model_args.oneshot_device)
         else:
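For context, the surviving two calls first strip any accelerate hooks already attached to the model, then re-dispatch it with CPU offloading onto the oneshot device. A minimal sketch of the same pattern using only accelerate's public API follows; note that force_cpu_offload is llmcompressor's own helper, accelerate's cpu_offload is used here purely as an illustrative stand-in, and offload_model_for_oneshot is a hypothetical name.

import torch
from accelerate import cpu_offload
from accelerate.hooks import remove_hook_from_module

def offload_model_for_oneshot(model: torch.nn.Module, oneshot_device: torch.device):
    # Hypothetical helper, not part of llmcompressor.
    # Drop any hooks left over from a previous dispatch so they do not stack.
    remove_hook_from_module(model, recurse=True)
    # Keep weights on CPU and stream them to the execution device per forward call.
    cpu_offload(model, execution_device=oneshot_device)
    return model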
File 2 of 5:

@@ -16,7 +16,7 @@
 )
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pipelines.sequential.helpers import (
+    disable_offloading,
     get_targets_from_modifiers,
-    keep_onload_context,
 )
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context

@@ -88,7 +88,7 @@ def __call__(
         prop_desc = f"({layer_index + 1}/{num_layers}): Propagating"

         # reduce memory movement by keeping modules onloaded
-        with keep_onload_context():
+        with disable_offloading():
             # do a preliminary pass to trigger modifier hooks
             for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc):
                 inputs = intermediates.fetch(batch_idx)
File 3 of 5:

@@ -28,5 +28,5 @@
     "trace_subgraphs",
     "Subgraph",
     "get_targets_from_modifiers",
-    "keep_onload_context",
+    "disable_offloading",
 ]

@@ -494,7 +494,11 @@ def is_ancestor(module: Module) -> bool:

 @contextlib.contextmanager
-def keep_onload_context():
+def disable_offloading():
+    """
+    Keep modules onloaded and disable offloading until this context exits.
+    Affects modules which have been hooked with accelerate's `AlignDevicesHook`
+    """
     original_pre_forward = AlignDevicesHook.pre_forward
     onloaded_modules = dict()
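For reference, a minimal sketch of how such a context manager can be built on top of accelerate's AlignDevicesHook. This is an illustration of the idea only, not the library's implementation; the name disable_offloading_sketch and the bookkeeping details are assumptions.

import contextlib
from accelerate.hooks import AlignDevicesHook

@contextlib.contextmanager
def disable_offloading_sketch():
    # Patch AlignDevicesHook.pre_forward so that, the first time a module is
    # onloaded inside this context, its hook stops offloading the weights
    # after forward. On exit, restore the flags and offload everything back.
    original_pre_forward = AlignDevicesHook.pre_forward
    onloaded = {}  # module -> (hook, original "offload" flag)

    def keep_onloaded(self, module, *args, **kwargs):
        ret = original_pre_forward(self, module, *args, **kwargs)
        if module not in onloaded:
            onloaded[module] = (self, self.offload)
            self.offload = False  # post_forward no longer moves weights off-device
        return ret

    try:
        AlignDevicesHook.pre_forward = keep_onloaded
        yield
    finally:
        AlignDevicesHook.pre_forward = original_pre_forward
        for module, (hook, offload) in onloaded.items():
            hook.offload = offload
            hook.post_forward(module, None)  # move weights back off-device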
File 4 of 5:

@@ -11,8 +11,8 @@
 from llmcompressor.pipelines.cache import IntermediatesCache
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pipelines.sequential.helpers import (
+    disable_offloading,
     get_targets_from_modifiers,
-    keep_onload_context,
     trace_subgraphs,
 )
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context

@@ -86,7 +86,7 @@ def __call__(
         prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating"

         # reduce memory movement by keeping modules onloaded
-        with keep_onload_context():
+        with disable_offloading():
             # do a preliminary pass to trigger modifier hooks
             for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc):
                 inputs = activations.fetch(batch_idx, subgraph.input_names)
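Taken together, the two pipeline changes above amount to the same calling pattern: wrap the repeated calibration forward passes in the renamed context so offloaded modules stay on the execution device for the whole loop instead of being moved off and back per batch. A minimal usage sketch, with model and dataloader as placeholders:

from llmcompressor.pipelines.sequential.helpers import disable_offloading

def run_calibration(model, dataloader):  # placeholder model and dataloader
    with disable_offloading():
        for batch in dataloader:
            model(**batch)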
File 5 of 5: this file was deleted.