Commit cf1f87d

implement oneshot_device, pipeline warnings
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 6705bf4 commit cf1f87d

6 files changed (+38, −44)

src/llmcompressor/args/model_arguments.py

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ class ModelArguments:
         metadata={"help": "Whether to compress sparse models during save"},
     )
     oneshot_device: Optional[str] = field(
-        default="cuda:0",
+        default="cuda",
         metadata={"help": "Device to run oneshot calibration on"},
     )
     model_revision: str = field(
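
For context, `oneshot_device` is consumed by the oneshot entrypoint to decide where calibration executes; dropping the explicit ":0" index defers device selection to torch. A minimal usage sketch (the import path, model id, dataset, and recipe are illustrative assumptions, not part of this commit):

    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import GPTQModifier

    oneshot(
        model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
        dataset="ultrachat_200k",                     # placeholder dataset
        recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
        oneshot_device="cuda",  # new default; previously "cuda:0"
    )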

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 1 addition & 37 deletions

@@ -2,9 +2,6 @@
 from datetime import datetime
 from typing import Optional
 
-import torch
-from compressed_tensors import force_cpu_offload
-from compressed_tensors.utils import get_execution_device
 from loguru import logger
 from torch.utils.data import DataLoader
 from transformers import PreTrainedModel
@@ -13,11 +10,7 @@
 from llmcompressor.core.session_functions import active_session
 from llmcompressor.datasets import get_calibration_dataloader
 from llmcompressor.entrypoints.utils import post_process, pre_process
-from llmcompressor.pipelines import (
-    CalibrationPipeline,
-    LayerSequentialPipeline,
-    SequentialPipeline,
-)
+from llmcompressor.pipelines import CalibrationPipeline
 
 __all__ = ["Oneshot", "oneshot"]
 
@@ -193,35 +186,6 @@ def apply_recipe_modifiers(
         user_pipeline = self.dataset_args.pipeline
         modifiers = session.get_modifiers()
         pipeline = CalibrationPipeline.from_modifiers(modifiers, user=user_pipeline)
-
-        model_exec_device = get_execution_device(self.model)
-
-        # Sequential pipelines onload models layer by layer to minimize GPU memory usage
-        if isinstance(pipeline, (SequentialPipeline, LayerSequentialPipeline)):
-            # unless pure cpu run, throw warning if model lives on oneshot_device
-            if (
-                model_exec_device
-                == self.model_args.oneshot_device
-                != torch.device("cpu")
-            ):
-                logger.warning(
-                    f"Model device {model_exec_device} is the same as oneshot"
-                    " execution device. If you encounter OOM errors, consider"
-                    " loading the model up on CPU, so that more memory is available"
-                    " for the oneshot algorithm to run on GPU. Example available at"
-                    " examples/quantization_w4a16/llama3_example.py"
-                )
-
-            # set cpu offload for model
-            elif (
-                model_exec_device
-                == torch.device("cpu")
-                != self.model_args.oneshot_device
-            ):
-                force_cpu_offload(
-                    self.model, execution_devce=self.model_args.oneshot_device
-                )
-
         pipeline(self.model, calibration_dataloader, self.dataset_args)
 
         session.finalize()
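
Two things worth noting about the deleted block. First, the removed `force_cpu_offload` call spelled its keyword `execution_devce`, so that branch would presumably have raised a TypeError whenever it was reached; the replacement call in `pre_process` (next file) passes the device positionally. Second, the device handling is relocated rather than dropped: offloading moves to `pre_process`, and the memory warning moves into the pipelines themselves, leaving this method as roughly the following sketch:

    # sketch of apply_recipe_modifiers after this commit; names as in the diff
    user_pipeline = self.dataset_args.pipeline
    modifiers = session.get_modifiers()
    pipeline = CalibrationPipeline.from_modifiers(modifiers, user=user_pipeline)
    pipeline(self.model, calibration_dataloader, self.dataset_args)
    session.finalize()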

src/llmcompressor/entrypoints/utils.py

Lines changed: 12 additions & 0 deletions

@@ -3,6 +3,8 @@
 from pathlib import PosixPath
 from typing import Optional, Tuple
 
+import torch
+from compressed_tensors.utils import force_cpu_offload
 from loguru import logger
 from torch.nn import Module
 from transformers import (
@@ -62,6 +64,16 @@ def pre_process(model_args: "ModelArguments"):
     # untie tie_word_embeddings weights
     patch_tied_tensors_bug(model_args.model)
 
+    # offload to cpu if possible
+    if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available():
+        # TODO: consider renaming function to something like "offload_dispatch_model"
+        # TODO: modify function to remove any hooks if they already exist (making sure
+        # to move to cpu when removing hook)
+        force_cpu_offload(model_args.model, model_args.oneshot_device)
+
+    else:
+        logger.warning("CUDA is not available! Compressing model on CPU instead")
+
     # wrap model.save_pretrained
     modify_save_pretrained(model_args.model)
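
The pipeline warnings added below recommend calling `force_cpu_offload` yourself when you bypass this entrypoint. A minimal sketch of doing so, assuming a standard transformers load path (the model id is a placeholder):

    import torch
    from compressed_tensors.utils import force_cpu_offload
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
        torch_dtype="auto",  # weights materialize on CPU by default
    )
    if torch.cuda.is_available():
        # keep weights resident on CPU; onload to GPU module by module
        force_cpu_offload(model, "cuda")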

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 11 additions & 0 deletions

@@ -2,6 +2,7 @@
 
 import torch
 import tqdm
+from loguru import logger
 from torch.utils.data.dataloader import DataLoader
 
 from llmcompressor.core import LifecycleCallbacks, active_session
@@ -57,6 +58,16 @@ def __call__(
         """
         session = active_session()
 
+        # check for offloading
+        if model.device != torch.device("meta"):
+            logger.warning(
+                "Attempting to use sequential pipeline with a model which is not "
+                "offloaded to the cpu. Deploying a model in this way may lead to more "
+                "memory usage than is required. It is recommended to set "
+                '`oneshot_device="cuda"` or call `force_cpu_offload` on your model '
+                "before compressing"
+            )
+
         # find layers
         modifiers = session.get_modifiers()
         sequential_targets, _ = get_targets_from_modifiers(modifiers, model)
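
The `meta` comparison works because accelerate-style offloading, which `force_cpu_offload` appears to build on, swaps module parameters for meta-device placeholders and onloads the real CPU-resident weights per module; a fully materialized model reports a concrete device instead. A sketch of the heuristic under that assumption (the helper name is hypothetical):

    import torch

    def is_cpu_offloaded(model) -> bool:
        # offloaded models report "meta" because their real weights live in a
        # CPU-side map; materialized models report "cpu" or "cuda:N"
        return model.device == torch.device("meta")

The identical check and warning are added to the sequential pipeline below.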

src/llmcompressor/pipelines/registry.py

Lines changed: 2 additions & 6 deletions

@@ -18,6 +18,7 @@
 __all__ = ["CalibrationPipeline"]
 
 SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase)
+NEED_DATA = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS)
 
 
 class CalibrationPipeline(ABC, RegistryMixin):
@@ -60,7 +61,7 @@ def from_modifiers(
 
     @staticmethod
     def _validate_infer_pipeline(modifiers: List[Modifier]) -> str:
-        if any(isinstance(modifier, SEQUENTIAL_MODIFIERS) for modifier in modifiers):
+        if any(isinstance(modifier, NEED_DATA) for modifier in modifiers):
             return "sequential"
 
         active_qmods = _get_active_quant_modifiers(modifiers)
@@ -76,11 +77,6 @@ def _validate_infer_pipeline(modifiers: List[Modifier]) -> str:
             config = quant_modifier.resolve_quantization_config()
             if config.requires_calibration_data():
                 return "sequential"
-            else:
-                return "datafree"
-
-        if any(isinstance(modifier, SmoothQuantModifier) for modifier in modifiers):
-            return "sequential"
 
         return "datafree"
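
Net effect on pipeline inference: SmoothQuantModifier now takes the same early "sequential" path as the sequential modifiers, and the `else: return "datafree"` short-circuit, which could previously return before the SmoothQuant check at the bottom was reached, is gone. Restated as a standalone sketch (modifier classes as imported in registry.py):

    SEQUENTIAL_MODIFIERS = (AWQModifier, GPTQModifier, SparsityModifierBase)
    NEED_DATA = (SmoothQuantModifier, *SEQUENTIAL_MODIFIERS)

    def infer_pipeline(modifiers) -> str:
        if any(isinstance(m, NEED_DATA) for m in modifiers):
            return "sequential"
        # a lone active quantization modifier still falls through to the
        # requires_calibration_data() config check; everything else is data-free
        return "datafree"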

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 11 additions & 0 deletions

@@ -2,6 +2,7 @@
 
 import torch
 from compressed_tensors.utils import get_execution_device
+from loguru import logger
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
 
@@ -52,6 +53,16 @@ def __call__(
         """
         session = active_session()
 
+        # check for offloading
+        if model.device != torch.device("meta"):
+            logger.warning(
+                "Attempting to use sequential pipeline with a model which is not "
+                "offloaded to the cpu. Deploying a model in this way may lead to more "
+                "memory usage than is required. It is recommended to set "
+                '`oneshot_device="cuda"` or call `force_cpu_offload` on your model '
+                "before compressing"
+            )
+
         # prepare to trace subgraphs
         modifiers = session.get_modifiers()
         sequential_targets = get_targets_from_modifiers(modifiers, model)
