Commit 8e58e35 (parent: 91b349b)

remove fallback_to_cpu, use ct utils

Signed-off-by: Kyle Sayers <[email protected]>

5 files changed: +6 -70 lines

src/llmcompressor/entrypoints/utils.py

Lines changed: 2 additions & 13 deletions

@@ -17,7 +17,7 @@
 from llmcompressor.args import ModelArguments, RecipeArguments, TrainingArguments
 from llmcompressor.core import reset_session
-from llmcompressor.pytorch.model_load.helpers import fallback_to_cpu, parse_dtype
+from llmcompressor.pytorch.model_load.helpers import parse_dtype
 from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
     modify_save_pretrained,
     patch_tied_tensors_bug,
@@ -197,20 +197,12 @@ def initialize_model_from_path(
         else model_args.model_name_or_path
     )

-    # Fallback to CPU if GPU requested and not available
-    model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device)
-
-    device_map = model_args.oneshot_device
-    if training_args is not None and training_args.do_train:
-        device_map = "auto"
-
     model_kwargs = {
         "config": config,
         "cache_dir": model_args.cache_dir,
         "revision": model_args.model_revision,
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
-        "device_map": device_map,
         "trust_remote_code": model_args.trust_remote_code_model,
     }

@@ -220,10 +212,7 @@ def initialize_model_from_path(
             run_compressed=False
         )

-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        **model_kwargs,
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
     if "sequence_length" in model_kwargs:
         model.seqlen = model_kwargs["sequence_length"]
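With the device logic gone, the call site reduces to a plain `from_pretrained`. A minimal sketch of the resulting load path, with placeholder values standing in for the fields of `ModelArguments` (the model id and dtype below are illustrative, not from the commit):

```python
import torch
from transformers import AutoModelForCausalLM

model_path = "facebook/opt-125m"  # placeholder model id

model_kwargs = {
    "torch_dtype": torch.bfloat16,  # parse_dtype(model_args.precision) in the real code
    "trust_remote_code": False,
    # no "device_map" entry anymore: placement is left to transformers'
    # defaults, with onloading/offloading handled by compressed-tensors utils
}

model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
```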

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 2 additions & 4 deletions

@@ -2,6 +2,7 @@

 import torch
 import tqdm
+from compressed_tensors.utils import disable_offloading
 from loguru import logger
 from torch.utils.data.dataloader import DataLoader

@@ -15,10 +16,7 @@
     to_next_layer_kwargs,
 )
 from llmcompressor.pipelines.registry import CalibrationPipeline
-from llmcompressor.pipelines.sequential.helpers import (
-    disable_offloading,
-    get_targets_from_modifiers,
-)
+from llmcompressor.pipelines.sequential.helpers import get_targets_from_modifiers
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context

 if TYPE_CHECKING:
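Both pipelines now import `disable_offloading` from `compressed_tensors.utils` rather than from the local `sequential.helpers` module; per the removed docstring, it keeps accelerate-hooked modules onloaded until the context exits. A hedged usage sketch (the layer and batches below are stand-ins, not the pipeline's real objects):

```python
import torch
from compressed_tensors.utils import disable_offloading

layer = torch.nn.Linear(16, 16)                   # stand-in for a dispatched model layer
batches = [torch.randn(4, 16) for _ in range(8)]  # stand-in calibration inputs

# Inside the context, an accelerate AlignDevicesHook onloads a module's
# weights once and skips the per-call offload; everything that was onloaded
# is offloaded a single time when the context exits.
with disable_offloading():
    for batch in batches:
        layer(batch)
```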

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 1 addition & 34 deletions

@@ -5,7 +5,6 @@
 from typing import Any, Dict, List, Optional, Set

 import torch
-from accelerate.hooks import AlignDevicesHook
 from compressed_tensors import has_offloaded_params
 from compressed_tensors.quantization import find_name_or_class_matches
 from loguru import logger
@@ -24,12 +23,7 @@

 from .ast_helpers import autowrap_forwards

-__all__ = [
-    "trace_subgraphs",
-    "Subgraph",
-    "get_targets_from_modifiers",
-    "disable_offloading",
-]
+__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"]


 @dataclass
@@ -491,30 +485,3 @@ def is_ancestor(module: Module) -> bool:

     is_ancestor(model)
     return ancestors
-
-
-@contextlib.contextmanager
-def disable_offloading():
-    """
-    Keep modules onloaded and disable offloading until this context exits.
-    Affects modules which have been hooked with accelerate's `AlignDevicesHook`
-    """
-    original_pre_forward = AlignDevicesHook.pre_forward
-    onloaded_modules = dict()
-
-    # onload once and disable any future onloading/offloading steps
-    def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs):
-        ret = original_pre_forward(self, module, *args, **kwargs)
-        if module not in onloaded_modules:
-            onloaded_modules[module] = (self, self.offload)
-            self.offload = False
-        return ret
-
-    # use the patched pre_forward function within the context
-    with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward):
-        yield
-
-    # manually offload all modules that were onloaded
-    for module, (hook, offload) in onloaded_modules.items():
-        hook.offload = offload
-        hook.post_forward(module, None)
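The deleted context manager worked by temporarily monkeypatching `AlignDevicesHook.pre_forward` via `patch_attr`. For readers unfamiliar with that pattern, a self-contained sketch of a `patch_attr`-style context manager (written out here for illustration; the project's own helper may differ in detail):

```python
import contextlib

@contextlib.contextmanager
def patch_attr(obj, name, value):
    """Temporarily replace obj.<name> with value, restoring the original on exit."""
    original = getattr(obj, name)
    setattr(obj, name, value)
    try:
        yield
    finally:
        setattr(obj, name, original)

class Hook:
    def pre_forward(self):
        return "original"

# the patched method is visible only inside the context
with patch_attr(Hook, "pre_forward", lambda self: "patched"):
    assert Hook().pre_forward() == "patched"
assert Hook().pre_forward() == "original"
```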

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 1 addition & 2 deletions

@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING

 import torch
-from compressed_tensors.utils import get_execution_device
+from compressed_tensors.utils import disable_offloading, get_execution_device
 from loguru import logger
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
@@ -11,7 +11,6 @@
 from llmcompressor.pipelines.cache import IntermediatesCache
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pipelines.sequential.helpers import (
-    disable_offloading,
     get_targets_from_modifiers,
     trace_subgraphs,
 )

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 0 additions & 17 deletions

@@ -15,7 +15,6 @@

 __all__ = [
     "copy_python_files_from_model_cache",
-    "fallback_to_cpu",
     "parse_dtype",
     "get_session_model",
     "get_completed_stages",
@@ -71,22 +70,6 @@ def save_checkpoint(
     compressor.decompress_model(model)


-def fallback_to_cpu(device: str) -> str:
-    """
-    Takes in a device string and forces it to cpu if cuda is not available
-
-    :param device: device id to check
-    :return: device modified for CUDA status
-    """
-    if "cuda" in device and not torch.cuda.is_available():
-        logger.warning(
-            f"Requested {device} but CUDA is not available, falling back to CPU"
-        )
-        return "cpu"
-
-    return device
-
-
 def parse_dtype(dtype_arg: Union[str, torch.dtype]) -> torch.dtype:
     """
     :param dtype_arg: dtype or string to parse
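`fallback_to_cpu` is removed without a replacement in this module; the entrypoint simply stops choosing a device. Downstream code that still wants the old behavior can inline the check, roughly as follows (a sketch reproducing the removed logic; the function name is mine):

```python
import torch

def pick_device(requested: str = "cuda:0") -> str:
    # mirror of the removed helper: fall back to CPU when CUDA is
    # requested but not available on this machine
    if "cuda" in requested and not torch.cuda.is_available():
        return "cpu"
    return requested

device = pick_device("cuda:0")
```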
