
Commit b2367ce

dispatch in pipelines
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 4bb86e5 commit b2367ce

7 files changed: +43 −35 lines changed


examples/quantization_w4a16/llama3_example.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from llmcompressor.utils.dev import dispatch_for_generation
 
 # Select model and load it.
-model_id = "meta-llama/Llama-3.3-70B-Instruct"
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
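For context, the surrounding example flow is sketched below (abridged from the upstream W4A16 example; the dataset, recipe arguments, and generation details are illustrative rather than a verbatim copy of the file). Note that the model is dispatched to GPU(s) only after calibration:

# Abridged sketch of the example around this change; arguments follow the
# upstream example but are not verbatim.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils.dev import dispatch_for_generation

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantize weights to int4 with GPTQ, leaving the lm_head in full precision.
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
oneshot(model=model, dataset="ultrachat_200k", recipe=recipe,
        max_seq_length=2048, num_calibration_samples=512)

# Dispatch to GPU(s) only after calibration, for a quick sanity generation.
dispatch_for_generation(model)
inputs = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
print(tokenizer.decode(model.generate(inputs, max_new_tokens=50)[0]))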

src/llmcompressor/args/model_arguments.py

Lines changed: 4 additions & 1 deletion
@@ -82,7 +82,10 @@ class ModelArguments:
     )
     oneshot_device: Optional[str] = field(
         default="cuda",
-        metadata={"help": "Device to run oneshot calibration on"},
+        metadata={
+            "help": "This argument is deprecated and nonfunctional "
+            "and will be removed in future release"
+        },
     )
     model_revision: str = field(
         default="main",

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 0 additions & 10 deletions
@@ -2,8 +2,6 @@
 from datetime import datetime
 from typing import Optional
 
-import torch
-from compressed_tensors.utils import offloaded_dispatch
 from loguru import logger
 from torch.utils.data import DataLoader
 from transformers import PreTrainedModel
@@ -125,14 +123,6 @@ def __init__(
         # initialize the model and processor
         pre_process(model_args)
 
-        # offload to cpu if possible
-        if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available():
-            offloaded_dispatch(
-                model_args.model, execution_device=model_args.oneshot_device
-            )
-        else:
-            logger.warning("CUDA is not available! Compressing model on CPU instead")
-
         # Set instance attributes
         self.model = self.model_args.model
         self.processor = self.model_args.processor
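With this block removed, the oneshot entrypoint no longer moves the model at all; placement is deferred to whichever calibration pipeline runs (see the pipeline changes below). A minimal sketch of the resulting calling convention, with an illustrative model and recipe:

# Sketch: the model is loaded without any device placement; the pipeline
# invoked during calibration dispatches it (dispatch_for_sequential or
# dispatch_for_generation, depending on the pipeline).
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto")
oneshot(
    model=model,
    dataset="open_platypus",
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    num_calibration_samples=16,
)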

src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pytorch.utils.helpers import tensors_to_device
+from llmcompressor.utils.dev import dispatch_for_generation
 from llmcompressor.utils.helpers import calibration_forward_context
 
 if TYPE_CHECKING:
@@ -37,6 +38,7 @@ def __call__(
         :param dataloader: loads data for calibration
         :param dataset_args: dataset arguments relevant to pipelines
         """
+        dispatch_for_generation(model)
        model_device = get_execution_device(model)
 
         LifecycleCallbacks.calibration_epoch_start()
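The basic pipeline runs whole-model forward passes, so it now dispatches the full model up front. A standalone sketch of the call, with an illustrative small model:

# Sketch: dispatch_for_generation places the whole model onto available
# GPU(s) (accelerate-style device mapping), in contrast to the sequential
# pipelines below, which keep weights offloaded and onload one layer at a time.
from transformers import AutoModelForCausalLM

from llmcompressor.utils.dev import dispatch_for_generation

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto")
dispatch_for_generation(model)
print(next(model.parameters()).device)  # e.g. cuda:0 when a GPU is present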

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 6 additions & 11 deletions
@@ -3,7 +3,6 @@
 import torch
 import tqdm
 from compressed_tensors.utils import disable_offloading
-from loguru import logger
 from torch.utils.data.dataloader import DataLoader
 
 from llmcompressor.core import LifecycleCallbacks, active_session
@@ -16,7 +15,10 @@
     to_next_layer_kwargs,
 )
 from llmcompressor.pipelines.registry import CalibrationPipeline
-from llmcompressor.pipelines.sequential.helpers import get_sequential_targets
+from llmcompressor.pipelines.sequential.helpers import (
+    dispatch_for_sequential,
+    get_sequential_targets,
+)
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
 
 if TYPE_CHECKING:
@@ -56,15 +58,8 @@ def __call__(
         """
         session = active_session()
 
-        # check for offloading
-        if model.device != torch.device("meta"):
-            logger.warning(
-                "Attemping to use sequential pipeline with a model which is not "
-                "offloaded to the cpu. Deploying a model in this way may lead to more "
-                "memory usage than is required. It is recommended to set "
-                '`oneshot_device="cuda"` or call `force_cpu_offload` on your model '
-                "before compressing"
-            )
+        # prepare model for sequential onloading
+        dispatch_for_sequential(model)
 
         # find layers
         modifiers = session.get_modifiers()

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 27 additions & 2 deletions
@@ -5,8 +5,9 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
 
 import torch
-from compressed_tensors import has_offloaded_params
+from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.quantization import find_name_or_class_matches
+from compressed_tensors.utils import has_offloaded_params, offloaded_dispatch
 from loguru import logger
 from torch.fx import Graph, GraphModule, Node
 from torch.fx.graph import PythonCode
@@ -26,7 +27,12 @@
 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments
 
-__all__ = ["trace_subgraphs", "Subgraph", "get_sequential_targets"]
+__all__ = [
+    "trace_subgraphs",
+    "Subgraph",
+    "get_sequential_targets",
+    "dispatch_for_sequential",
+]
 
 
 @dataclass
@@ -503,3 +509,22 @@ def is_ancestor(module: Module) -> bool:
 
     is_ancestor(model)
     return ancestors
+
+
+def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
+    """
+    Dispatch a model for sequential calibration using a sequential pipeline.
+    The model will be offloaded to the CPU and dispatched to CUDA device if available.
+    Removes any existing hooks.
+
+    :param model: model to dispatch
+    :return: dispatched model
+    """
+    remove_hook_from_module(model, recurse=True)
+
+    if torch.cuda.is_available():
+        offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
+    else:
+        logger.warning("CUDA is not available! Compressing model on CPU instead")
+
+    return model
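The new helper is self-contained, so it can also be exercised directly; a minimal sketch (model name arbitrary):

# Sketch: dispatch_for_sequential strips any stale accelerate hooks, then
# (when CUDA is available) registers offloading hooks so weights live on CPU
# and are onloaded to cuda:0 module-by-module during calibration forwards.
from transformers import AutoModelForCausalLM

from llmcompressor.pipelines.sequential.helpers import dispatch_for_sequential

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto")
model = dispatch_for_sequential(model)  # returns the same model, now dispatched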

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 3 additions & 10 deletions
@@ -2,7 +2,6 @@
 
 import torch
 from compressed_tensors.utils import disable_offloading, get_execution_device
-from loguru import logger
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
 
@@ -11,6 +10,7 @@
 from llmcompressor.pipelines.cache import IntermediatesCache
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pipelines.sequential.helpers import (
+    dispatch_for_sequential,
     get_sequential_targets,
     trace_subgraphs,
 )
@@ -52,15 +52,8 @@ def __call__(
         """
         session = active_session()
 
-        # check for offloading
-        if model.device != torch.device("meta"):
-            logger.warning(
-                "Attemping to use sequential pipeline with a model which is not "
-                "offloaded to the cpu. Deploying a model in this way may lead to more "
-                "memory usage than is required. It is recommended to set "
-                '`oneshot_device="cuda"` or call `force_cpu_offload` on your model '
-                "before compressing"
-            )
+        # prepare model for sequential onloading
+        dispatch_for_sequential(model)
 
         # prepare to trace subgraphs
         modifiers = session.get_modifiers()
