Skip to content

Commit cf09876

Browse files
committed
appropriate oneshot_device for determining onloading
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 63d1934 commit cf09876

File tree

9 files changed

+95
-65
lines changed

9 files changed

+95
-65
lines changed

src/llmcompressor/entrypoints/utils.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from transformers.utils.quantization_config import CompressedTensorsConfig
1616

1717
from llmcompressor.args import ModelArguments, TrainingArguments
18-
from llmcompressor.pytorch.model_load.helpers import fallback_to_cpu, parse_dtype
18+
from llmcompressor.pytorch.model_load.helpers import parse_dtype
1919
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
2020
modify_save_pretrained,
2121
patch_tied_tensors_bug,
@@ -178,20 +178,12 @@ def initialize_model_from_path(
178178
else model_args.model_name_or_path
179179
)
180180

181-
# Fallback to CPU if GPU requested and not available
182-
model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device)
183-
184-
device_map = model_args.oneshot_device
185-
if training_args is not None and training_args.do_train:
186-
device_map = "auto"
187-
188181
model_kwargs = {
189182
"config": config,
190183
"cache_dir": model_args.cache_dir,
191184
"revision": model_args.model_revision,
192185
"use_auth_token": True if model_args.use_auth_token else None,
193186
"torch_dtype": parse_dtype(model_args.precision),
194-
"device_map": device_map,
195187
"trust_remote_code": model_args.trust_remote_code_model,
196188
}
197189

src/llmcompressor/modifiers/obcq/sgpt_mixin.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,8 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
170170
state.data.calib,
171171
self.sequential_targets,
172172
self.ignore,
173-
self,
173+
None, # TODO: pass in oneshot_device argument https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
174+
self, # TODO: use callbacks https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
174175
)
175176
return True
176177

@@ -186,7 +187,8 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
186187
state.model,
187188
state.data.calib,
188189
self.sequential_targets,
189-
self,
190+
None, # TODO: pass in oneshot_device argument https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
191+
self, # TODO: use callbacks https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
190192
)
191193
return True
192194

@@ -200,7 +202,12 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
200202
"Falling back to basic pipeline, which requires extra memory and "
201203
"may result in decreased accuracy"
202204
)
203-
run_basic(state.model, state.data.calib, self)
205+
run_basic(
206+
state.model,
207+
state.data.calib,
208+
None, # TODO: pass in oneshot_device argument https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
209+
self, # TODO: use callbacks https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
210+
)
204211
return True
205212

206213
def _infer_sequential_targets(

src/llmcompressor/modifiers/quantization/gptq/base.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
236236
state.data.calib,
237237
self.sequential_targets,
238238
self.ignore,
239-
self,
239+
None, # TODO: pass in oneshot_device argument https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
240+
self, # TODO: use callbacks https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
240241
)
241242
return True
242243

@@ -257,7 +258,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
257258
state.model,
258259
state.data.calib,
259260
self.sequential_targets,
260-
self,
261+
None, # TODO: pass in oneshot_device argument https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
262+
self, # TODO: use callbacks https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
261263
)
262264
return True
263265

@@ -272,7 +274,12 @@ def on_initialize(self, state: State, **kwargs) -> bool:
272274
"may result in decreased accuracy. Consider using "
273275
"`offload_hessians=True`"
274276
)
275-
run_basic(state.model, state.data.calib, self)
277+
run_basic(
278+
state.model,
279+
state.data.calib,
280+
None, # TODO: pass in oneshot_device argument https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
281+
self, # TODO: use callbacks https://github.com/vllm-project/llm-compressor/pull/1279 # noqa: E501
282+
)
276283
return True
277284

278285
def on_finalize(self, state: State, **kwargs) -> bool:

src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import torch
44
import torch.utils.data.dataloader
55
import tqdm
6-
from compressed_tensors.utils import get_execution_device
6+
from loguru import logger
7+
from transformers import PreTrainedModel
78

89
from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
910
from llmcompressor.pytorch.utils.helpers import tensors_to_device
@@ -16,8 +17,9 @@
1617

1718

1819
def run_pipeline(
19-
model: torch.nn.Module,
20+
model: PreTrainedModel,
2021
dataloader: torch.utils.data.DataLoader,
22+
oneshot_device: Optional[torch.device],
2123
callback_modifier: Optional["Modifier"] = None,
2224
):
2325
"""
@@ -32,12 +34,16 @@ def run_pipeline(
3234
:param dataloader: loads data for calibration
3335
:param callback_modifier: Temporary HACK which should be replaced by event callback
3436
"""
35-
model_device = get_execution_device(model)
37+
if oneshot_device is not None:
38+
logger.warning(
39+
"Basic pipeline does not utilize `oneshot_device` argument, instead use "
40+
"`from_pretrained(device_map=...)` to determine onloading behavior"
41+
)
3642

3743
with calibration_forward_context(model):
3844
for batch in tqdm.tqdm(dataloader, desc="Calibrating"):
3945
batch = apply_pad_mask_to_batch(batch)
40-
batch = tensors_to_device(batch, model_device)
46+
batch = tensors_to_device(batch, model.device)
4147
model(**batch)
4248

4349
# TODO: replace with a lifecycle event

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
maybe_inject_pos_embeddings,
1313
to_next_layer_kwargs,
1414
)
15+
from llmcompressor.pipelines.sequential.helpers import infer_oneshot_device
1516
from llmcompressor.utils.helpers import align_modules, calibration_forward_context
1617

1718
if TYPE_CHECKING:
@@ -24,6 +25,7 @@ def run_pipeline(
2425
model: torch.nn.Module,
2526
dataloader: torch.utils.data.DataLoader,
2627
sequential_targets: List[str],
28+
oneshot_device: Optional[torch.device],
2729
callback_modifier: Optional["Modifier"] = None,
2830
):
2931
"""
@@ -46,8 +48,13 @@ def run_pipeline(
4648
:param model: model being calibrated
4749
:param dataloader: loads data for calibration
4850
:param sequential_targets: patterns which match to the layer modules of the model
51+
:param oneshot_device: device to onload layers onto; uses device_map if None
4952
:param callback_modifier: Temporary HACK which should be replaced by event callback
5053
"""
54+
# if the model is dispatched, use the dispatch to determine onloading, return None
55+
# otherwise, infer a oneshot device (either user passed or the first available gpu)
56+
oneshot_device = infer_oneshot_device(model, oneshot_device)
57+
5158
# find layers
5259
layers = match_modules(model, sequential_targets)
5360

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import inspect
22
from collections import deque
33
from dataclasses import dataclass
4-
from typing import Any, Callable, Dict, List, Set, Union
4+
from typing import Any, Callable, Dict, List, Optional, Set, Union
55

6+
import torch
67
from compressed_tensors import has_offloaded_params
78
from compressed_tensors.quantization import find_name_or_class_matches
9+
from loguru import logger
810
from torch.fx import Graph, GraphModule, Node
911
from torch.fx.proxy import Argument
1012
from torch.nn import Module
@@ -399,3 +401,33 @@ def get_subgraph_modules(subgraph: Graph, parent_graph: GraphModule) -> List[Mod
399401
modules_ops: List[Node] = subgraph.find_nodes(op="call_module")
400402
called_modules = [parent_graph.get_submodule(op.target) for op in modules_ops]
401403
return list({m for module in called_modules for m in module.modules()})
404+
405+
406+
def infer_oneshot_device(
407+
model: PreTrainedModel, oneshot_device: Optional[torch.device]
408+
) -> Optional[torch.device]:
409+
if is_gpu_dispatched(model):
410+
logger.warning(
411+
"Calibrating a model dispatched to the gpu can potentially lead to OOM "
412+
"errors. Consider loading the model without a `device_map` and instead "
413+
"executing with `cuda:0` (set `oneshot_device` to override this default)"
414+
)
415+
return None
416+
417+
elif oneshot_device is None:
418+
has_cuda = torch.cuda.is_available()
419+
oneshot_device = torch.device("cuda:0") if has_cuda else torch.device("cpu")
420+
logger.info(f"No oneshot_device passed, using {oneshot_device}")
421+
422+
return oneshot_device
423+
424+
425+
def is_gpu_dispatched(model: PreTrainedModel) -> bool:
426+
for module in model.modules():
427+
if any(param.device not in ("meta", "cpu") for param in module.parameters()):
428+
return True
429+
430+
if has_offloaded_params(module) and module._hf_hook.execution_device != "cpu":
431+
return True
432+
433+
return False

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@
22

33
import torch
44
import torch.utils.data.dataloader
5-
from compressed_tensors.utils import get_execution_device
65
from tqdm import tqdm
6+
from transformers import PreTrainedModel
77

88
from llmcompressor.modifiers.utils.hooks import HooksMixin
99
from llmcompressor.pipelines.cache import IntermediatesCache
10-
from llmcompressor.pipelines.sequential.helpers import trace_subgraphs
10+
from llmcompressor.pipelines.sequential.helpers import (
11+
infer_oneshot_device,
12+
trace_subgraphs,
13+
)
1114
from llmcompressor.utils.helpers import align_modules, calibration_forward_context
1215

1316
if TYPE_CHECKING:
@@ -17,10 +20,11 @@
1720

1821

1922
def run_pipeline(
20-
model: torch.nn.Module,
23+
model: PreTrainedModel,
2124
dataloader: torch.utils.data.DataLoader,
2225
sequential_targets: List[str],
2326
ignore: List[str],
27+
oneshot_device: Optional[torch.device],
2428
callback_modifier: Optional["Modifier"] = None,
2529
):
2630
"""
@@ -45,16 +49,22 @@ def run_pipeline(
4549
:param dataloader: loads data for calibration
4650
:param sequential_targets: patterns which match to the layer modules of the model
4751
:param ignore: patterns which match to modules which should be ignored by tracing
52+
:param oneshot_device: device to onload layers onto; uses device_map if None
53+
:param callback_modifier: Temporary HACK which should be replaced by event callback
4854
"""
55+
# if the model is dispatched, use the dispatch to determine onloading, return None
56+
# otherwise, infer a oneshot device (either user passed or the first available gpu)
57+
oneshot_device = infer_oneshot_device(model, oneshot_device)
58+
4959
# trace subgraphs
5060
sample_input = next(iter(dataloader))
5161
subgraphs = trace_subgraphs(model, sample_input, sequential_targets, ignore)
5262

53-
with calibration_forward_context(model):
54-
# prepare intermediates cache
55-
model_device = get_execution_device(model)
56-
intermediates = IntermediatesCache.from_dataloader(dataloader, model_device)
63+
# prepare intermediates cache
64+
model_device = oneshot_device or model.device
65+
intermediates = IntermediatesCache.from_dataloader(dataloader, model_device)
5766

67+
with calibration_forward_context(model):
5868
num_subgraphs = len(subgraphs)
5969
for subgraph_index, subgraph in enumerate(subgraphs):
6070
# prepare tqdm description texts
@@ -64,7 +74,7 @@ def run_pipeline(
6474
# compile subgraph forward function
6575
forward_function = subgraph.compile_forward()
6676

67-
with align_modules(subgraph.modules):
77+
with align_modules(subgraph.modules, oneshot_device):
6878
# do a preliminary pass to trigger modifier hooks
6979
for batch_index in tqdm(range(len(dataloader)), desc=calib_desc):
7080
inputs = intermediates.fetch(batch_index, subgraph.input_names)

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
__all__ = [
1717
"copy_python_files_from_model_cache",
18-
"fallback_to_cpu",
1918
"parse_dtype",
2019
"get_session_model",
2120
"get_completed_stages",
@@ -41,22 +40,6 @@ def save_checkpoint(
4140
processor.save_pretrained(save_path)
4241

4342

44-
def fallback_to_cpu(device: str) -> str:
45-
"""
46-
Takes in a device string and forces it to cpu if cuda is not available
47-
48-
:param device: device id to check
49-
:return: device modified for CUDA status
50-
"""
51-
if "cuda" in device and not torch.cuda.is_available():
52-
logger.warning(
53-
f"Requested {device} but CUDA is not available, falling back to CPU"
54-
)
55-
return "cpu"
56-
57-
return device
58-
59-
6043
def parse_dtype(dtype_arg: Union[str, torch.dtype]) -> torch.dtype:
6144
"""
6245
:param dtype_arg: dtype or string to parse

src/llmcompressor/utils/helpers.py

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import numpy
2525
import torch
2626
from compressed_tensors.quantization import disable_quantization, enable_quantization
27-
from compressed_tensors.utils import has_offloaded_params
27+
from compressed_tensors.utils import align_module_device
2828
from loguru import logger
2929
from transformers import PreTrainedModel
3030

@@ -1068,22 +1068,8 @@ def preserve_attr(base: object, attr: str):
10681068
def align_modules(
10691069
modules: Iterable[torch.nn.Module], execution_device: Optional[torch.device] = None
10701070
):
1071-
original_devices = {}
1072-
can_offload = [module for module in modules if has_offloaded_params(module)]
1073-
1074-
for module in can_offload:
1075-
if execution_device is not None:
1076-
module._hf_hook.execution_device = execution_device
1077-
original_devices[module] = module._hf_hook.execution_device
1078-
1079-
module._hf_hook.pre_forward(module)
1080-
module._hf_hook.offload = False
1081-
1082-
yield
1083-
1084-
for module in can_offload:
1085-
if execution_device is not None:
1086-
module._hf_hook.execution_device = original_devices[module]
1087-
1088-
module._hf_hook.offload = True
1089-
module._hf_hook.post_forward(module, None)
1071+
with contextlib.ExitStack() as stack:
1072+
[
1073+
stack.enter_context(align_module_device(module, execution_device))
1074+
for module in modules
1075+
]

0 commit comments

Comments
 (0)