Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 1 addition & 19 deletions src/llmcompressor/pipelines/sequential/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import torch
from accelerate.hooks import remove_hook_from_module
from compressed_tensors.offload import disable_onloading, offload_model
from compressed_tensors.offload import disable_onloading
from compressed_tensors.utils import patch_attr
from compressed_tensors.utils.match import match_named_modules
from loguru import logger
Expand All @@ -35,7 +35,6 @@
"trace_subgraphs",
"Subgraph",
"get_sequential_targets",
"dispatch_for_sequential",
"handle_sequential_oom",
]

Expand Down Expand Up @@ -516,23 +515,6 @@ def is_ancestor(module: Module) -> bool:
return ancestors


def dispatch_for_sequential(
    model: PreTrainedModel,
    onload_device: Optional[torch.device | str] = None,
    offload_device: Optional[torch.device | str] = None,
) -> PreTrainedModel:
    """
    Prepare a model for sequential calibration by delegating to
    `offload_model`.

    If no onload device is given, the device reported by `get_main_device()`
    is used instead. Weight placement and any handling of pre-existing hooks
    are performed by `offload_model`, not by this wrapper.

    :param model: model to dispatch
    :param onload_device: device forwarded to `offload_model` as the onload
        target; when None, `get_main_device()` supplies the value
    :param offload_device: device forwarded unchanged to `offload_model` as
        the offload target (may be None)
    :return: the value returned by `offload_model` for this model
    """
    # Only an explicit None triggers the fallback to the main device.
    target_device = get_main_device() if onload_device is None else onload_device
    return offload_model(model, target_device, offload_device)


def _get_autowrap_functions() -> tuple[Callable[[Any], Any], ...]:
try:
Expand Down
4 changes: 2 additions & 2 deletions src/llmcompressor/pipelines/sequential/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
from llmcompressor.modifiers.utils.hooks import HooksMixin
from llmcompressor.pipelines.cache import IntermediatesCache
from llmcompressor.pipelines.registry import CalibrationPipeline
from compressed_tensors.offload import set_onload_device
from llmcompressor.pipelines.sequential.helpers import (
dispatch_for_sequential,
get_sequential_targets,
handle_sequential_oom,
trace_subgraphs,
Expand Down Expand Up @@ -89,7 +89,7 @@ def __call__(
# prepare model for sequential onloading
onload_device = get_main_device()
offload_device = torch.device(dataset_args.sequential_offload_device)
dispatch_for_sequential(model, onload_device)
set_onload_device(model, onload_device)

# prepare to trace subgraphs
modifiers = session.lifecycle.recipe.modifiers
Expand Down
4 changes: 2 additions & 2 deletions tests/llmcompressor/utils/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest
import torch
from compressed_tensors.offload import dispatch_model, offload_model
from compressed_tensors.offload import dispatch_model, set_onload_device
from transformers import (
AutoModelForCausalLM,
MllamaForConditionalGeneration,
Expand Down Expand Up @@ -71,7 +71,7 @@ def test_disable_cache(model_cls, model_stub):
def test_disable_lm_head(offload):
model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2")
if offload == "sequential":
offload_model(model, "cuda")
set_onload_device(model, "cuda")
if offload == "basic":
dispatch_model(model)
if offload == "none":
Expand Down
Loading