Skip to content

Commit 28a6e31

Browse files
kylesayrs and yiliu30
authored and committed
[Offloading] Support Disk Offloading (vllm-project#2373)
## Purpose ## * Support disk offloading for very large models ## Prerequisites ## * [[Offload] Convert accelerate for loading/saving](vllm-project/compressed-tensors#572) ## Examples ## * Add `examples/disk_offloading/qwen3_example.py` * Add `examples/disk_offloading/kimi_k2_example.py` ## Changes ## ### Required ### * Remove post-processing step where `remove_dispatch` is called * Previously, this was used to avoid conflicts between `dispatch_for_sequential` and `dispatch_for_generation`. * Now, the two functions are directly compatible: you don't need to remove the dispatch of one to use the other * Add `to_accelerate` to `save_pretrained_wrapper` * This ensures that the model is converted to `accelerate` offloading before saving * This ensures the best compatibility with `save_pretrained`, and reduces excess memory usage which would cause gpu/cpu ooms ### Hardening / Future Prep Changes ### * During oneshot preprocessing, convert `from_accelerate` if possible. This guards against users who load their model outside of the `load_offloaded_model` context * Remove `offload_device` argument from `dispatch_for_sequential` to avoid deprecation warning * `dispatch_for_sequential` now always respects the device the model was loaded on ## Testing ## * Ran `Qwen/Qwen3-0.6B` example to completion * [IN PROGRESS] Run `unsloth/Kimi-K2-Instruct-0905-BF16` example to completion --------- Signed-off-by: Kyle Sayers <kylesayrs@gmail.com> Signed-off-by: yiliu30 <yi4.liu@intel.com>
1 parent 144affb commit 28a6e31

File tree

7 files changed

+143
-23
lines changed

7 files changed

+143
-23
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
"""Disk-offloading example: quantize a very large model (Kimi K2) to NVFP4.

The model is loaded inside the `load_offloaded_model` context with
`device_map="auto_offload"`, which places as many weights as possible on CPU
RAM and spills the remainder to disk under `offload_folder`.
"""

from compressed_tensors.offload import get_device_map, load_offloaded_model
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it in the `load_offloaded_model` context
with load_offloaded_model():
    model_id = "unsloth/Kimi-K2-Instruct-0905-BF16"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype="auto",
        device_map="auto_offload",  # fit as much as possible on cpu, rest goes on disk
        trust_remote_code=True,
        offload_folder="./offload_folder",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Confirm that model is dispatched correctly
devices = {offloaded for _onloaded, offloaded in get_device_map(model).values()}
print(f"Model was offloaded to the following devices: {devices}")

# Select calibration dataset.
DATASET_ID = "ultrachat-200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start for accuracy;
# this example uses 20 to keep the disk-offloaded run fast.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 20
MAX_SEQUENCE_LENGTH = 2048

# Configure the quantization algorithm to run.
# * quantize the weights to NVFP4
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

# Apply algorithms.
oneshot(
    model=model,
    processor=tokenizer,
    dataset=DATASET_ID,
    splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
"""Disk-offloading example: quantize Qwen3-0.6B to NVFP4 with emulated low RAM.

Large-model disk offloading is emulated by capping the CPU memory budget
(`max_memory={"cpu": 6e8}`) below the model size so part of the weights are
offloaded to `offload_folder` on disk.
"""

from compressed_tensors.offload import (
    dispatch_model,
    get_device_map,
    load_offloaded_model,
)
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it in the `load_offloaded_model` context
# In this example, we emulate large model quantization with disk offloading by
# restricting the theoretical size of CPU RAM to be smaller than the size of the model
with load_offloaded_model():
    model_id = "Qwen/Qwen3-0.6B"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype="auto",
        device_map="auto_offload",  # fit as much as possible on cpu, rest goes on disk
        max_memory={"cpu": 6e8},  # remove this line to use as much cpu as possible
        offload_folder="./offload_folder",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

# Confirm that model is dispatched correctly
devices = {offloaded for _onloaded, offloaded in get_device_map(model).values()}
print(f"Model was offloaded to the following devices: {devices}")

# Select calibration dataset.
DATASET_ID = "ultrachat-200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start for accuracy;
# this example uses 20 to keep the demo fast.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 20
MAX_SEQUENCE_LENGTH = 2048

# Configure the quantization algorithm to run.
# * quantize the weights to NVFP4
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

# Apply algorithms.
oneshot(
    model=model,
    dataset=DATASET_ID,
    splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)  # onload to an execution device before generating
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

src/llmcompressor/entrypoints/utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import os
1111
from pathlib import PosixPath
1212

13-
from compressed_tensors.utils import remove_dispatch
13+
from compressed_tensors.offload import from_accelerate
1414
from loguru import logger
1515
from transformers import (
1616
AutoConfig,
@@ -84,6 +84,10 @@ def pre_process(
8484
if not model_args.tie_word_embeddings:
8585
untie_word_embeddings(model_args.model)
8686

87+
# if the model was loaded with accelerate offloading, convert to CT offloading
88+
if hasattr(model_args.model, "hf_device_map"):
89+
from_accelerate(model_args.model)
90+
8791
# wrap model.save_pretrained
8892
modify_save_pretrained(model_args.model)
8993

@@ -104,10 +108,6 @@ def post_process(
104108
Raises:
105109
ValueError: If saving fails due to an invalid `output_dir` or other issues.
106110
"""
107-
# remove any existing dispatches
108-
if model_args is not None and model_args.model is not None:
109-
remove_dispatch(model_args.model)
110-
111111
if model_args is not None and output_dir is not None:
112112
if recipe_args is not None and getattr(recipe_args, "stage", None) is not None:
113113
output_dir = os.path.join(output_dir, recipe_args.stage)

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ def is_ancestor(module: Module) -> bool:
532532
def dispatch_for_sequential(
533533
model: PreTrainedModel,
534534
onload_device: Optional[torch.device | str] = None,
535-
offload_device: torch.device | str = torch.device("cpu"),
535+
offload_device: Optional[torch.device | str] = None,
536536
) -> PreTrainedModel:
537537
"""
538538
Dispatch a model for sequential calibration using a sequential pipeline.

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def __call__(
9797
# prepare model for sequential onloading
9898
onload_device = get_main_device()
9999
offload_device = torch.device(dataset_args.sequential_offload_device)
100-
dispatch_for_sequential(model, onload_device, offload_device)
100+
dispatch_for_sequential(model, onload_device)
101101

102102
# prepare to trace subgraphs
103103
modifiers = session.lifecycle.recipe.modifiers

src/llmcompressor/transformers/compression/compressed_tensors_utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
SparsityCompressionConfig,
1010
)
1111
from compressed_tensors.config import CompressionFormat
12-
from compressed_tensors.offload import is_rank0
12+
from compressed_tensors.offload import is_rank0, to_accelerate
1313
from loguru import logger
1414
from transformers import PreTrainedModel
1515

@@ -90,6 +90,9 @@ def save_pretrained_wrapper(
9090
compressor.compress_model(model)
9191

9292
if is_rank0():
93+
# convert to accelerate offloaded for optimal saving with transformers
94+
to_accelerate(model)
95+
9396
# save (compressed) model structure
9497
original_save_pretrained.__get__(model, model_class)(
9598
save_directory,

tests/llmcompressor/transformers/compression/test_quantization.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pytest
22
import torch
3+
from accelerate.utils import align_module_device
34
from compressed_tensors.offload import dispatch_model
45
from compressed_tensors.quantization.utils import is_module_quantized
56
from torch.utils.data import DataLoader
@@ -36,22 +37,23 @@ def _get_quant_info(model):
3637
quant_info_weights = {}
3738
quant_info_inputs = {}
3839
for name, module in model.named_modules():
39-
if is_module_quantized(module):
40-
if module.quantization_scheme.weights is not None:
41-
quant_info_weights[name] = (
42-
module.weight_scale,
43-
module.weight_zero_point,
44-
module.weight,
45-
)
46-
47-
if module.quantization_scheme.input_activations is not None:
48-
is_dynamic = module.quantization_scheme.input_activations.dynamic
49-
if not is_dynamic:
50-
quant_info_inputs[name] = (
51-
module.input_scale,
52-
module.input_zero_point,
40+
with align_module_device(module):
41+
if is_module_quantized(module):
42+
if module.quantization_scheme.weights is not None:
43+
quant_info_weights[name] = (
44+
module.weight_scale,
45+
module.weight_zero_point,
46+
module.weight,
5347
)
5448

49+
if module.quantization_scheme.input_activations is not None:
50+
is_dynamic = module.quantization_scheme.input_activations.dynamic
51+
if not is_dynamic:
52+
quant_info_inputs[name] = (
53+
module.input_scale,
54+
module.input_zero_point,
55+
)
56+
5557
return quant_info_weights, quant_info_inputs
5658

5759

@@ -85,7 +87,7 @@ def setup_model_and_config(request, tmpdir_factory):
8587
num_calibration_samples=num_calibration_samples,
8688
recipe=config["new_recipe"],
8789
pad_to_max_length=pad_to_max_length,
88-
splits={"calibration": "train_gen[:1%]"},
90+
splits={"calibration": f"train_gen[:{num_calibration_samples}]"},
8991
save_compressed=False,
9092
)
9193

0 commit comments

Comments
 (0)