Skip to content

Commit 382c3e6

Browse files
committed
dispatching
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 61da757 commit 382c3e6

File tree

3 files changed

+41
-18
lines changed

3 files changed

+41
-18
lines changed

src/llmcompressor/entrypoints/utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
from pathlib import PosixPath
44
from typing import Optional, Tuple
55

6+
import torch
7+
from accelerate import dispatch_model
8+
from accelerate.hooks import remove_hook_from_module
69
from loguru import logger
710
from torch.nn import Module
811
from transformers import (
@@ -27,6 +30,7 @@
2730
)
2831
from llmcompressor.typing import Processor
2932
from llmcompressor.utils.fsdp.helpers import is_fsdp_model
33+
from llmcompressor.utils.offload import has_device_execution
3034

3135

3236
def pre_process(model_args: "ModelArguments"):
@@ -65,6 +69,15 @@ def pre_process(model_args: "ModelArguments"):
6569
# wrap model.save_pretrained
6670
modify_save_pretrained(model_args.model)
6771

72+
# dispatch to oneshot device if loaded onto CPU
73+
# this needs to be done before qparams are initialized
74+
if not has_device_execution(model) and torch.cuda.is_available():
75+
model_args.oneshot_device = model_args.oneshot_device or torch.device("cuda:0")
76+
remove_hook_from_module(model_args.model, recurse=True)
77+
model_args.model = dispatch_model(
78+
model_args.model, main_device=model_args.oneshot_device, force_hooks=True
79+
)
80+
6881

6982
def post_process(
7083
model_args: Optional["ModelArguments"] = None,

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from llmcompressor.modifiers.utils.hooks import HooksMixin
1818
from llmcompressor.utils.helpers import calibration_forward_context, preserve_attr
19+
from llmcompressor.utils.offload import has_device_parameters
1920

2021
__all__ = ["trace_subgraphs", "Subgraph"]
2122

@@ -403,11 +404,12 @@ def get_subgraph_modules(subgraph: Graph, parent_graph: GraphModule) -> List[Mod
403404
def infer_oneshot_device(
404405
model: PreTrainedModel, oneshot_device: Optional[torch.device]
405406
) -> Optional[torch.device]:
406-
if is_gpu_dispatched(model):
407+
if has_device_parameters(model):
407408
logger.warning(
408-
"Calibrating a model dispatched to the gpu can potentially lead to OOM "
409-
"errors. Consider loading the model without a `device_map` and instead "
410-
"executing with `cuda:0` (set `oneshot_device` to override this default)"
409+
"Calibrating a model with gpu parameters using the sequential pipeline can "
410+
"potentially lead to OOM errors. Consider loading the model without a "
411+
"`device_map` and instead executing with `cuda:0` (set `oneshot_device` "
412+
"to override this default)"
411413
)
412414
return None
413415

@@ -417,17 +419,3 @@ def infer_oneshot_device(
417419
logger.info(f"No oneshot_device passed, using {oneshot_device}")
418420

419421
return oneshot_device
420-
421-
422-
def is_gpu_dispatched(model: PreTrainedModel) -> bool:
423-
for module in model.modules():
424-
if any(
425-
param.device not in (torch.device("meta"), torch.device("cpu"))
426-
for param in module.parameters()
427-
):
428-
return True
429-
430-
if has_offloaded_params(module) and module._hf_hook.execution_device != "cpu":
431-
return True
432-
433-
return False

src/llmcompressor/utils/offload.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import torch
2+
from accelerate.utils import has_offloaded_params
3+
4+
__all__ = ["has_device_parameters", "has_device_execution_hook", "has_device_execution"]
5+
6+
7+
def has_device_parameters(model: torch.nn.Module) -> bool:
    """
    Return ``True`` if any parameter of ``model`` lives on a real device,
    i.e. on anything other than the CPU or the meta device.

    :param model: module whose parameters are inspected
    :return: ``True`` when at least one parameter is on a non-CPU,
        non-meta device; ``False`` otherwise (including parameterless models)
    """
    # devices that do NOT count as "device" placement
    host_devices = (torch.device("cpu"), torch.device("meta"))
    for parameter in model.parameters():
        if parameter.device not in host_devices:
            return True
    return False
12+
13+
14+
def has_device_execution_hook(model: torch.nn.Module) -> bool:
    """
    Return ``True`` if any submodule of ``model`` carries an accelerate
    offload hook whose execution device is not the CPU.

    :param model: module whose submodules are inspected
    :return: ``True`` when some offloaded submodule executes on a
        non-CPU device; ``False`` otherwise
    """
    for submodule in model.modules():
        if not has_offloaded_params(submodule):
            continue
        # NOTE(review): compares against the string "cpu"; if accelerate ever
        # stores the execution device as torch.device("cpu") or an int index,
        # this comparison would not match — confirm against accelerate's hook API
        if submodule._hf_hook.execution_device != "cpu":
            return True
    return False
19+
20+
21+
def has_device_execution(model: torch.nn.Module) -> bool:
    """
    Return ``True`` if ``model`` will execute on a device, either because an
    accelerate hook dispatches it there or because its parameters already
    reside on one.

    :param model: module to inspect
    :return: ``True`` when device execution is detected via hooks or
        parameter placement; ``False`` otherwise
    """
    # check hooks first, mirroring the cheaper-signal-first disjunction
    if has_device_execution_hook(model):
        return True
    return has_device_parameters(model)

0 commit comments

Comments
 (0)