
Commit b2367ce

dispatch in pipelines
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 4bb86e5 commit b2367ce

7 files changed: +43 −35 lines changed


examples/quantization_w4a16/llama3_example.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from llmcompressor.utils.dev import dispatch_for_generation
 
 # Select model and load it.
-model_id = "meta-llama/Llama-3.3-70B-Instruct"
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
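For context, the surrounding example flow is sketched below (abridged from the upstream W4A16 example; the dataset, recipe arguments, and generation details are illustrative rather than a verbatim copy of the file). Note that the model is dispatched to GPU(s) only after calibration:

# Abridged sketch of the example around this change; arguments follow the
# upstream example but are not verbatim.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils.dev import dispatch_for_generation

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantize weights to int4 with GPTQ, leaving the lm_head in full precision.
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
oneshot(model=model, dataset="ultrachat_200k", recipe=recipe,
        max_seq_length=2048, num_calibration_samples=512)

# Dispatch to GPU(s) only after calibration, for a quick sanity generation.
dispatch_for_generation(model)
inputs = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
print(tokenizer.decode(model.generate(inputs, max_new_tokens=50)[0]))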

src/llmcompressor/args/model_arguments.py

Lines changed: 4 additions & 1 deletion
@@ -82,7 +82,10 @@ class ModelArguments:
     )
     oneshot_device: Optional[str] = field(
         default="cuda",
-        metadata={"help": "Device to run oneshot calibration on"},
+        metadata={
+            "help": "This argument is deprecated and nonfunctional "
+            "and will be removed in future release"
+        },
     )
     model_revision: str = field(
         default="main",

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 0 additions & 10 deletions
@@ -2,8 +2,6 @@
 from datetime import datetime
 from typing import Optional
 
-import torch
-from compressed_tensors.utils import offloaded_dispatch
 from loguru import logger
 from torch.utils.data import DataLoader
 from transformers import PreTrainedModel
@@ -125,14 +123,6 @@ def __init__(
         # initialize the model and processor
         pre_process(model_args)
 
-        # offload to cpu if possible
-        if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available():
-            offloaded_dispatch(
-                model_args.model, execution_device=model_args.oneshot_device
-            )
-        else:
-            logger.warning("CUDA is not available! Compressing model on CPU instead")
-
         # Set instance attributes
         self.model = self.model_args.model
         self.processor = self.model_args.processor
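With this block removed, the oneshot entrypoint no longer moves the model at all; placement is deferred to whichever calibration pipeline runs (see the pipeline changes below). A minimal sketch of the resulting calling convention, with an illustrative model and recipe:

# Sketch: the model is loaded without any device placement; the pipeline
# invoked during calibration dispatches it (dispatch_for_sequential or
# dispatch_for_generation, depending on the pipeline).
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto")
oneshot(
    model=model,
    dataset="open_platypus",
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    num_calibration_samples=16,
)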

src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pytorch.utils.helpers import tensors_to_device
+from llmcompressor.utils.dev import dispatch_for_generation
 from llmcompressor.utils.helpers import calibration_forward_context
 
 if TYPE_CHECKING:
@@ -37,6 +38,7 @@ def __call__(
         :param dataloader: loads data for calibration
         :param dataset_args: dataset arguments relevant to pipelines
         """
+        dispatch_for_generation(model)
        model_device = get_execution_device(model)
 
         LifecycleCallbacks.calibration_epoch_start()
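The basic pipeline runs whole-model forward passes, so it now dispatches the full model up front. A standalone sketch of the call, with an illustrative small model:

# Sketch: dispatch_for_generation places the whole model onto available
# GPU(s) (accelerate-style device mapping), in contrast to the sequential
# pipelines below, which keep weights offloaded and onload one layer at a time.
from transformers import AutoModelForCausalLM

from llmcompressor.utils.dev import dispatch_for_generation

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto")
dispatch_for_generation(model)
print(next(model.parameters()).device)  # e.g. cuda:0 when a GPU is present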

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 6 additions & 11 deletions
@@ -3,7 +3,6 @@
 import torch
 import tqdm
 from compressed_tensors.utils import disable_offloading
-from loguru import logger
 from torch.utils.data.dataloader import DataLoader
 
 from llmcompressor.core import LifecycleCallbacks, active_session
@@ -16,7 +15,10 @@
     to_next_layer_kwargs,
 )
 from llmcompressor.pipelines.registry import CalibrationPipeline
-from llmcompressor.pipelines.sequential.helpers import get_sequential_targets
+from llmcompressor.pipelines.sequential.helpers import (
+    dispatch_for_sequential,
+    get_sequential_targets,
+)
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
 
 if TYPE_CHECKING:
@@ -56,15 +58,8 @@ def __call__(
         """
         session = active_session()
 
-        # check for offloading
-        if model.device != torch.device("meta"):
-            logger.warning(
-                "Attemping to use sequential pipeline with a model which is not "
-                "offloaded to the cpu. Deploying a model in this way may lead to more "
-                "memory usage than is required. It is recommended to set "
-                '`oneshot_device="cuda"` or call `force_cpu_offload` on your model '
-                "before compressing"
-            )
+        # prepare model for sequential onloading
+        dispatch_for_sequential(model)
 
         # find layers
         modifiers = session.get_modifiers()

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 27 additions & 2 deletions
@@ -5,8 +5,9 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
 
 import torch
-from compressed_tensors import has_offloaded_params
+from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.quantization import find_name_or_class_matches
+from compressed_tensors.utils import has_offloaded_params, offloaded_dispatch
 from loguru import logger
 from torch.fx import Graph, GraphModule, Node
 from torch.fx.graph import PythonCode
@@ -26,7 +27,12 @@
 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments
 
-__all__ = ["trace_subgraphs", "Subgraph", "get_sequential_targets"]
+__all__ = [
+    "trace_subgraphs",
+    "Subgraph",
+    "get_sequential_targets",
+    "dispatch_for_sequential",
+]
 
 
 @dataclass
@@ -503,3 +509,22 @@ def is_ancestor(module: Module) -> bool:
 
     is_ancestor(model)
     return ancestors
+
+
+def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
+    """
+    Dispatch a model for sequential calibration using a sequential pipeline.
+    The model will be offloaded to the CPU and dispatched to CUDA device if available.
+    Removes any existing hooks.
+
+    :param model: model to dispatch
+    :return: dispatched model
+    """
+    remove_hook_from_module(model, recurse=True)
+
+    if torch.cuda.is_available():
+        offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
+    else:
+        logger.warning("CUDA is not available! Compressing model on CPU instead")
+
+    return model
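The new helper is self-contained, so it can also be exercised directly; a minimal sketch (model name arbitrary):

# Sketch: dispatch_for_sequential strips any stale accelerate hooks, then
# (when CUDA is available) registers offloading hooks so weights live on CPU
# and are onloaded to cuda:0 module-by-module during calibration forwards.
from transformers import AutoModelForCausalLM

from llmcompressor.pipelines.sequential.helpers import dispatch_for_sequential

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto")
model = dispatch_for_sequential(model)  # returns the same model, now dispatched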

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 3 additions & 10 deletions
@@ -2,7 +2,6 @@
 
 import torch
 from compressed_tensors.utils import disable_offloading, get_execution_device
-from loguru import logger
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
 
@@ -11,6 +10,7 @@
 from llmcompressor.pipelines.cache import IntermediatesCache
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pipelines.sequential.helpers import (
+    dispatch_for_sequential,
     get_sequential_targets,
     trace_subgraphs,
 )
@@ -52,15 +52,8 @@ def __call__(
         """
         session = active_session()
 
-        # check for offloading
-        if model.device != torch.device("meta"):
-            logger.warning(
-                "Attemping to use sequential pipeline with a model which is not "
-                "offloaded to the cpu. Deploying a model in this way may lead to more "
-                "memory usage than is required. It is recommended to set "
-                '`oneshot_device="cuda"` or call `force_cpu_offload` on your model '
-                "before compressing"
-            )
+        # prepare model for sequential onloading
+        dispatch_for_sequential(model)
 
         # prepare to trace subgraphs
         modifiers = session.get_modifiers()
