Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions examples/disk_offloading/kimi_k2_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from compressed_tensors.offload import get_device_map, load_offloaded_model
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Calibration dataset and sampling configuration.
# Increasing the number of samples can improve accuracy at the cost of
# longer calibration time.
DATASET_ID = "ultrachat-200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 20
MAX_SEQUENCE_LENGTH = 2048

model_id = "unsloth/Kimi-K2-Instruct-0905-BF16"

# Load the model inside the `load_offloaded_model` context so that any weights
# that do not fit in CPU RAM are spilled to the offload folder on disk.
with load_offloaded_model():
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype="auto",
        device_map="auto_offload",  # fit as much as possible on cpu, rest goes on disk
        trust_remote_code=True,
        offload_folder="./offload_folder",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Confirm that model is dispatched correctly
devices = {offloaded for _onloaded, offloaded in get_device_map(model).values()}
print(f"Model was offloaded to the following devices: {devices}")

# Quantization recipe: quantize the weights of every Linear layer to NVFP4,
# leaving the lm_head in its original precision.
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

# Run one-shot calibration and apply the quantization recipe.
oneshot(
    model=model,
    processor=tokenizer,
    dataset=DATASET_ID,
    splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save the compressed checkpoint to disk.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
65 changes: 65 additions & 0 deletions examples/disk_offloading/qwen3_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from compressed_tensors.offload import (
    dispatch_model,
    get_device_map,
    load_offloaded_model,
)
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Calibration dataset and sampling configuration.
# Increasing the number of samples can improve accuracy at the cost of
# longer calibration time.
DATASET_ID = "ultrachat-200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 20
MAX_SEQUENCE_LENGTH = 2048

model_id = "Qwen/Qwen3-0.6B"

# Load the model inside the `load_offloaded_model` context. Large-model
# quantization with disk offloading is emulated here by capping the
# theoretical CPU RAM budget below the size of the model, which forces the
# remainder of the weights onto disk.
with load_offloaded_model():
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype="auto",
        device_map="auto_offload",  # fit as much as possible on cpu, rest goes on disk
        max_memory={"cpu": 6e8},  # remove this line to use as much cpu as possible
        offload_folder="./offload_folder",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

# Confirm that model is dispatched correctly
devices = {offloaded for _onloaded, offloaded in get_device_map(model).values()}
print(f"Model was offloaded to the following devices: {devices}")

# Quantization recipe: quantize the weights of every Linear layer to NVFP4,
# leaving the lm_head in its original precision.
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

# Run one-shot calibration and apply the quantization recipe.
oneshot(
    model=model,
    dataset=DATASET_ID,
    splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {name: tensor.to(model.device) for name, tensor in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save the compressed checkpoint to disk.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
10 changes: 5 additions & 5 deletions src/llmcompressor/entrypoints/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import os
from pathlib import PosixPath

from compressed_tensors.utils import remove_dispatch
from compressed_tensors.offload import from_accelerate
from loguru import logger
from transformers import (
AutoConfig,
Expand Down Expand Up @@ -84,6 +84,10 @@ def pre_process(
if not model_args.tie_word_embeddings:
untie_word_embeddings(model_args.model)

# if the model was loaded with accelerate offloading, convert to CT offloading
if hasattr(model_args.model, "hf_device_map"):
from_accelerate(model_args.model)

# wrap model.save_pretrained
modify_save_pretrained(model_args.model)

Expand All @@ -104,10 +108,6 @@ def post_process(
Raises:
ValueError: If saving fails due to an invalid `output_dir` or other issues.
"""
# remove any existing dispatches
if model_args is not None and model_args.model is not None:
remove_dispatch(model_args.model)

if model_args is not None and output_dir is not None:
if recipe_args is not None and getattr(recipe_args, "stage", None) is not None:
output_dir = os.path.join(output_dir, recipe_args.stage)
Expand Down
2 changes: 1 addition & 1 deletion src/llmcompressor/pipelines/sequential/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ def is_ancestor(module: Module) -> bool:
def dispatch_for_sequential(
model: PreTrainedModel,
onload_device: Optional[torch.device | str] = None,
offload_device: torch.device | str = torch.device("cpu"),
offload_device: Optional[torch.device | str] = None,
) -> PreTrainedModel:
"""
Dispatch a model for sequential calibration using a sequential pipeline.
Expand Down
2 changes: 1 addition & 1 deletion src/llmcompressor/pipelines/sequential/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def __call__(
# prepare model for sequential onloading
onload_device = get_main_device()
offload_device = torch.device(dataset_args.sequential_offload_device)
dispatch_for_sequential(model, onload_device, offload_device)
dispatch_for_sequential(model, onload_device)

# prepare to trace subgraphs
modifiers = session.lifecycle.recipe.modifiers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
SparsityCompressionConfig,
)
from compressed_tensors.config import CompressionFormat
from compressed_tensors.offload import is_rank0
from compressed_tensors.offload import is_rank0, to_accelerate
from loguru import logger
from transformers import PreTrainedModel

Expand Down Expand Up @@ -90,6 +90,9 @@ def save_pretrained_wrapper(
compressor.compress_model(model)

if is_rank0():
# convert to accelerate offloaded for optimal saving with transformers
to_accelerate(model)

# save (compressed) model structure
original_save_pretrained.__get__(model, model_class)(
save_directory,
Expand Down
32 changes: 17 additions & 15 deletions tests/llmcompressor/transformers/compression/test_quantization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
import torch
from accelerate.utils import align_module_device
from compressed_tensors.offload import dispatch_model
from compressed_tensors.quantization.utils import is_module_quantized
from torch.utils.data import DataLoader
Expand Down Expand Up @@ -36,22 +37,23 @@ def _get_quant_info(model):
quant_info_weights = {}
quant_info_inputs = {}
for name, module in model.named_modules():
if is_module_quantized(module):
if module.quantization_scheme.weights is not None:
quant_info_weights[name] = (
module.weight_scale,
module.weight_zero_point,
module.weight,
)

if module.quantization_scheme.input_activations is not None:
is_dynamic = module.quantization_scheme.input_activations.dynamic
if not is_dynamic:
quant_info_inputs[name] = (
module.input_scale,
module.input_zero_point,
with align_module_device(module):
if is_module_quantized(module):
if module.quantization_scheme.weights is not None:
quant_info_weights[name] = (
module.weight_scale,
module.weight_zero_point,
module.weight,
)

if module.quantization_scheme.input_activations is not None:
is_dynamic = module.quantization_scheme.input_activations.dynamic
if not is_dynamic:
quant_info_inputs[name] = (
module.input_scale,
module.input_zero_point,
)

return quant_info_weights, quant_info_inputs


Expand Down Expand Up @@ -85,7 +87,7 @@ def setup_model_and_config(request, tmpdir_factory):
num_calibration_samples=num_calibration_samples,
recipe=config["new_recipe"],
pad_to_max_length=pad_to_max_length,
splits={"calibration": "train_gen[:1%]"},
splits={"calibration": f"train_gen[:{num_calibration_samples}]"},
save_compressed=False,
)

Expand Down