Skip to content

Commit bd8db69

Browse files
committed
Merge remote-tracking branch 'origin' into kylesayrs/testing-device-map
2 parents 356e63c + 50bb656 commit bd8db69

File tree

19 files changed

+196
-44
lines changed

19 files changed

+196
-44
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
Big updates have landed in LLM Compressor! Check out these exciting new features:
1818

1919
* **Preliminary FP4 Quantization Support:** Quantize weights and activations to FP4 and seamlessly run the compressed model in vLLM. Model weights and activations are quantized following the NVFP4 [configuration](https://github.com/neuralmagic/compressed-tensors/blob/f5dbfc336b9c9c361b9fe7ae085d5cb0673e56eb/src/compressed_tensors/quantization/quant_scheme.py#L104). See examples of [weight-only quantization](examples/quantization_w4a16_fp4/llama3_example.py) and [fp4 activation support](examples/quantization_w4a4_fp4/llama3_example.py). Support is currently preliminary and additional support will be added for MoEs.
20-
* **Axolotl Sparse Finetuning Integration:** Easily finetune sparse LLMs through our seamless integration with Axolotl. [Learn more here](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor).
20+
* **Axolotl Sparse Finetuning Integration:** Seamlessly finetune sparse LLMs with our Axolotl integration. Learn how to create [fast sparse open-source models with Axolotl and LLM Compressor](https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open). See also the [Axolotl integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor).
2121
* **AutoAWQ Integration:** Perform low-bit weight-only quantization efficiently using AutoAWQ, now part of LLM Compressor. *Note: This integration should be considered experimental for now. Enhanced support, including for MoE models and improved handling of larger models via layer sequential pipelining, is planned for upcoming releases.* [See the details](https://github.com/vllm-project/llm-compressor/pull/1177).
2222
* **Day 0 Llama 4 Support:** Meta utilized LLM Compressor to create the [FP8-quantized Llama-4-Maverick-17B-128E](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8), optimized for vLLM inference using [compressed-tensors](https://github.com/neuralmagic/compressed-tensors) format.
2323

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
from pathlib import Path
2+
13
import torch
24
from loguru import logger
35
from transformers import AutoModelForCausalLM, AutoTokenizer
46

57
from llmcompressor import oneshot, train
6-
from llmcompressor.utils import dispatch_for_generation
78

89
# load the model in as bfloat16 to save on memory and compute
910
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
@@ -18,6 +19,7 @@
1819

1920
# save location of quantized model
2021
output_dir = "output_llama7b_2of4_w4a16_channel"
22+
output_path = Path(output_dir)
2123

2224
# set dataset config parameters
2325
splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
@@ -63,25 +65,26 @@
6365
# ./output_llama7b_2of4_w4a16_channel/ + (finetuning/sparsity/quantization)_stage
6466

6567
# Oneshot sparsification
66-
oneshot_applied_model = oneshot(
68+
69+
oneshot(
6770
model=model,
6871
**oneshot_kwargs,
72+
output_dir=output_dir,
6973
stage="sparsity_stage",
7074
)
7175

7276
# Sparse finetune
73-
dispatch_for_generation(model)
74-
finetune_applied_model = train(
75-
model=oneshot_applied_model,
77+
train(
78+
model=(output_path / "sparsity_stage"),
7679
**oneshot_kwargs,
7780
**training_kwargs,
81+
output_dir=output_dir,
7882
stage="finetuning_stage",
7983
)
8084

8185
# Oneshot quantization
82-
model.to("cpu")
8386
quantized_model = oneshot(
84-
model=finetune_applied_model,
87+
model=(output_path / "finetuning_stage"),
8588
**oneshot_kwargs,
8689
stage="quantization_stage",
8790
)

examples/quantization_w8a8_int8/gemma2_example.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,11 @@ def tokenize(sample):
6767
# NOTE: transformers 4.49.0 results in a generation error with gemma2.
6868
# Consider either downgrading to a previous transformers version
6969
# or using vLLM for sample generation.
70+
# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
7071
print("========== SAMPLE GENERATION ==============")
7172
dispatch_for_generation(model)
7273
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
73-
output = model.generate(input_ids, max_new_tokens=20)
74+
output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
7475
print(tokenizer.decode(output[0]))
7576
print("==========================================")
7677

src/llmcompressor/args/dataset_arguments.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments):
171171
"will execute code present on the Hub on your local machine."
172172
},
173173
)
174+
# --- pipeline arguments --- #
174175
pipeline: Optional[str] = field(
175176
default="independent",
176177
metadata={

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 120 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import os
22
from datetime import datetime
3-
from typing import Optional
3+
from typing import TYPE_CHECKING, List, Optional, Union
44

55
from loguru import logger
66
from torch.utils.data import DataLoader
7-
from transformers import PreTrainedModel
7+
from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin
88

99
from llmcompressor.args import parse_args
1010
from llmcompressor.core.session_functions import active_session
@@ -14,6 +14,9 @@
1414

1515
__all__ = ["Oneshot", "oneshot"]
1616

17+
if TYPE_CHECKING:
18+
from datasets import Dataset, DatasetDict
19+
1720

1821
class Oneshot:
1922
"""
@@ -102,7 +105,8 @@ def __init__(
102105
:param recipe_args: RecipeArguments parameters, responsible for containing
103106
recipe-related parameters
104107
:param output_dir: Path to save the output model after carrying out oneshot
105-
108+
:param log_dir: Path to save logs during oneshot run.
109+
Nothing is logged to file if None.
106110
"""
107111
# Set up logging
108112
if log_dir:
@@ -191,8 +195,119 @@ def apply_recipe_modifiers(
191195
session.finalize()
192196

193197

194-
def oneshot(**kwargs) -> PreTrainedModel:
195-
one_shot = Oneshot(**kwargs)
198+
def oneshot(
199+
# Model arguments
200+
model: Union[str, PreTrainedModel],
201+
distill_teacher: Optional[str] = None,
202+
config_name: Optional[str] = None,
203+
tokenizer: Optional[Union[str, PreTrainedTokenizerBase]] = None,
204+
processor: Optional[Union[str, ProcessorMixin]] = None,
205+
cache_dir: Optional[str] = None,
206+
use_auth_token: bool = False,
207+
precision: str = "auto",
208+
tie_word_embeddings: bool = False,
209+
trust_remote_code_model: bool = False,
210+
save_compressed: bool = True,
211+
oneshot_device: str = "cuda:0",
212+
model_revision: str = "main",
213+
# Recipe arguments
214+
recipe: Optional[Union[str, List[str]]] = None,
215+
recipe_args: Optional[List[str]] = None,
216+
clear_sparse_session: bool = False,
217+
stage: Optional[str] = None,
218+
# Dataset arguments
219+
dataset: Optional[Union[str, "Dataset", "DatasetDict"]] = None,
220+
dataset_config_name: Optional[str] = None,
221+
dataset_path: Optional[str] = None,
222+
num_calibration_samples: int = 512,
223+
shuffle_calibration_samples: bool = True,
224+
max_seq_length: int = 384,
225+
pad_to_max_length: bool = True,
226+
text_column: str = "text",
227+
concatenate_data: bool = False,
228+
streaming: bool = False,
229+
overwrite_cache: bool = False,
230+
preprocessing_num_workers: Optional[int] = None,
231+
min_tokens_per_module: Optional[float] = None,
232+
trust_remote_code_data: bool = False,
233+
# Miscellaneous arguments
234+
output_dir: Optional[str] = None,
235+
log_dir: Optional[str] = "sparse_logs",
236+
**kwargs,
237+
) -> PreTrainedModel:
238+
"""
239+
Performs oneshot calibration on a model.
240+
241+
# Model arguments
242+
:param model: A pretrained model identifier from huggingface.co/models or a path
243+
to a local model. Required parameter.
244+
:param distill_teacher: Teacher model (a trained text generation model)
245+
for distillation.
246+
:param config_name: Pretrained config name or path if not the same as
247+
model_name.
248+
:param tokenizer: Pretrained tokenizer name or path if not the same as
249+
model_name.
250+
:param processor: Pretrained processor name or path if not the same as
251+
model_name.
252+
:param cache_dir: Where to store the pretrained data from
253+
huggingface.co.
254+
:param use_auth_token: Whether to use Hugging Face auth token for private
255+
models.
256+
:param precision: Precision to cast model weights to, default to auto.
257+
:param tie_word_embeddings: Whether the model's input and output word embeddings
258+
should be tied.
259+
:param trust_remote_code_model: Whether to allow for custom models to execute
260+
their own modeling files.
261+
:param save_compressed: Whether to compress sparse models during save.
262+
:param oneshot_device: Device to run oneshot calibration on.
263+
:param model_revision: The specific model version to use (can be branch name,
264+
tag, or commit id).
265+
266+
# Recipe arguments
267+
:param recipe: Path to a LLM Compressor sparsification recipe.
268+
:param recipe_args: List of recipe arguments to evaluate, in the
269+
format "key1=value1", "key2=value2".
270+
:param clear_sparse_session: Whether to clear CompressionSession/
271+
CompressionLifecycle data between runs.
272+
:param stage: The stage of the recipe to use for oneshot.
273+
274+
# Dataset arguments
275+
:param dataset: The name of the dataset to use (via the datasets
276+
library).
277+
:param dataset_config_name: The configuration name of the dataset
278+
to use.
279+
:param dataset_path: Path to a custom dataset. Supports json, csv, dvc.
280+
:param num_calibration_samples: Number of samples to use for one-shot
281+
calibration.
282+
:param shuffle_calibration_samples: Whether to shuffle the dataset before
283+
calibration.
284+
:param max_seq_length: Maximum total input sequence length after tokenization.
285+
:param pad_to_max_length: Whether to pad all samples to `max_seq_length`.
286+
:param text_column: Key to use as the `text` input to tokenizer/processor.
287+
:param concatenate_data: Whether to concatenate datapoints to fill
288+
max_seq_length.
289+
:param streaming: True to stream data from a cloud dataset.
290+
:param overwrite_cache: Whether to overwrite the cached preprocessed datasets.
291+
:param preprocessing_num_workers: Number of processes for
292+
preprocessing.
293+
:param min_tokens_per_module: Minimum percentage of tokens per
294+
module, relevant for MoE models.
295+
:param trust_remote_code_data: Whether to allow for datasets defined on the Hub
296+
using a dataset script.
297+
298+
# Miscellaneous arguments
299+
:param output_dir: Path to save the output model after calibration.
300+
Nothing is saved if None.
301+
:param log_dir: Path to save logs during oneshot run.
302+
Nothing is logged to file if None.
303+
304+
:return: The calibrated PreTrainedModel
305+
"""
306+
307+
# pass all args directly into Oneshot
308+
local_args = locals()
309+
local_args.pop("kwargs")
310+
one_shot = Oneshot(**local_args, **kwargs)
196311
one_shot()
197312

198313
return one_shot.model

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,11 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
7272
)
7373
).reshape([1])
7474

75-
update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
76-
update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
77-
update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
78-
del global_scale
75+
update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
76+
update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
77+
update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
78+
79+
del global_scale
7980

8081
if _is_mlp_module(submodule):
8182
if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
@@ -91,10 +92,7 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
9192
)
9293
).reshape([1])
9394

94-
update_parameter_data(
95-
submodule.gate_proj, global_scale, "weight_global_scale"
96-
)
97-
update_parameter_data(
98-
submodule.up_proj, global_scale, "weight_global_scale"
99-
)
100-
del global_scale
95+
update_parameter_data(submodule.gate_proj, global_scale, "weight_global_scale")
96+
update_parameter_data(submodule.up_proj, global_scale, "weight_global_scale")
97+
98+
del global_scale

src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@
99
from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
1010
from llmcompressor.pipelines.registry import CalibrationPipeline
1111
from llmcompressor.pytorch.utils.helpers import tensors_to_device
12-
from llmcompressor.utils.dev import dispatch_for_generation
13-
from llmcompressor.utils.helpers import calibration_forward_context
12+
from llmcompressor.utils import calibration_forward_context, dispatch_for_generation
1413

1514
if TYPE_CHECKING:
1615
from llmcompressor.args.dataset_arguments import DatasetArguments

src/llmcompressor/pipelines/cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def empty(cls, num_batches: int, offload_device: torch.device):
6161
def from_dataloader(
6262
cls,
6363
dataloader: torch.utils.data.DataLoader,
64-
model_device: torch.device,
64+
model_device: torch.device = torch.device("cpu"),
6565
mask_padding: bool = True,
6666
offload_device: Optional[torch.device] = torch.device("cpu"),
6767
):

src/llmcompressor/pipelines/layer_sequential/helpers.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import torch
77
import tqdm
88
from compressed_tensors.quantization import find_name_or_class_matches
9-
from compressed_tensors.utils import get_execution_device
109
from torch.nn import Module
1110
from torch.utils.data.dataloader import DataLoader
1211

@@ -45,6 +44,7 @@ def capture_first_layer_intermediates(
4544
model: Module,
4645
first_layer: Module,
4746
dataloader: DataLoader,
47+
model_device: torch.device = torch.device("cpu"),
4848
mask_padding: bool = True,
4949
) -> IntermediatesCache:
5050
"""
@@ -62,7 +62,6 @@ def capture_first_layer_intermediates(
6262
:param mask_padding: zero out padding tokens if True. This affects modifiers such as
6363
GPTQ and SparseGPT
6464
"""
65-
model_device = get_execution_device(model)
6665
intermediates = IntermediatesCache.empty(len(dataloader), torch.device("cpu"))
6766
signature = inspect.signature(first_layer.forward)
6867

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import torch
44
import tqdm
5-
from compressed_tensors.utils import disable_offloading
5+
from compressed_tensors.utils import disable_offloading, get_execution_device
66
from torch.utils.data.dataloader import DataLoader
77

88
from llmcompressor.core import LifecycleCallbacks, active_session
@@ -60,6 +60,7 @@ def __call__(
6060

6161
# prepare model for sequential onloading
6262
dispatch_for_sequential(model)
63+
model_device = get_execution_device(model)
6364

6465
# find layers
6566
modifiers = session.get_modifiers()
@@ -71,7 +72,7 @@ def __call__(
7172
with calibration_forward_context(model), DisableQuantization(model):
7273
# prepare intermediates cache
7374
intermediates: IntermediatesCache = capture_first_layer_intermediates(
74-
model, layers[0], dataloader
75+
model, layers[0], dataloader, model_device
7576
)
7677

7778
num_layers = len(layers)

0 commit comments

Comments (0)