Skip to content

Commit bd8db69

Browse files
committed
Merge remote-tracking branch 'origin' into kylesayrs/testing-device-map
2 parents 356e63c + 50bb656 commit bd8db69

File tree

19 files changed

+196
-44
lines changed

19 files changed

+196
-44
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
Big updates have landed in LLM Compressor! Check out these exciting new features:
1818

1919
* **Preliminary FP4 Quantization Support:** Quantize weights and activations to FP4 and seamlessly run the compressed model in vLLM. Model weights and activations are quantized following the NVFP4 [configuration](https://github.com/neuralmagic/compressed-tensors/blob/f5dbfc336b9c9c361b9fe7ae085d5cb0673e56eb/src/compressed_tensors/quantization/quant_scheme.py#L104). See examples of [weight-only quantization](examples/quantization_w4a16_fp4/llama3_example.py) and [fp4 activation support](examples/quantization_w4a4_fp4/llama3_example.py). Support is currently preliminary and additional support will be added for MoEs.
20-
* **Axolotl Sparse Finetuning Integration:** Easily finetune sparse LLMs through our seamless integration with Axolotl. [Learn more here](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor).
20+
* **Axolotl Sparse Finetuning Integration:** Seamlessly finetune sparse LLMs with our Axolotl integration. Learn how to create [fast sparse open-source models with Axolotl and LLM Compressor](https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open). See also the [Axolotl integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor).
2121
* **AutoAWQ Integration:** Perform low-bit weight-only quantization efficiently using AutoAWQ, now part of LLM Compressor. *Note: This integration should be considered experimental for now. Enhanced support, including for MoE models and improved handling of larger models via layer sequential pipelining, is planned for upcoming releases.* [See the details](https://github.com/vllm-project/llm-compressor/pull/1177).
2222
* **Day 0 Llama 4 Support:** Meta utilized LLM Compressor to create the [FP8-quantized Llama-4-Maverick-17B-128E](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8), optimized for vLLM inference using [compressed-tensors](https://github.com/neuralmagic/compressed-tensors) format.
2323

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
from pathlib import Path
2+
13
import torch
24
from loguru import logger
35
from transformers import AutoModelForCausalLM, AutoTokenizer
46

57
from llmcompressor import oneshot, train
6-
from llmcompressor.utils import dispatch_for_generation
78

89
# load the model in as bfloat16 to save on memory and compute
910
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
@@ -18,6 +19,7 @@
1819

1920
# save location of quantized model
2021
output_dir = "output_llama7b_2of4_w4a16_channel"
22+
output_path = Path(output_dir)
2123

2224
# set dataset config parameters
2325
splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
@@ -63,25 +65,26 @@
6365
# ./output_llama7b_2of4_w4a16_channel/ + (finetuning/sparsity/quantization)_stage
6466

6567
# Oneshot sparsification
66-
oneshot_applied_model = oneshot(
68+
69+
oneshot(
6770
model=model,
6871
**oneshot_kwargs,
72+
output_dir=output_dir,
6973
stage="sparsity_stage",
7074
)
7175

7276
# Sparse finetune
73-
dispatch_for_generation(model)
74-
finetune_applied_model = train(
75-
model=oneshot_applied_model,
77+
train(
78+
model=(output_path / "sparsity_stage"),
7679
**oneshot_kwargs,
7780
**training_kwargs,
81+
output_dir=output_dir,
7882
stage="finetuning_stage",
7983
)
8084

8185
# Oneshot quantization
82-
model.to("cpu")
8386
quantized_model = oneshot(
84-
model=finetune_applied_model,
87+
model=(output_path / "finetuning_stage"),
8588
**oneshot_kwargs,
8689
stage="quantization_stage",
8790
)

examples/quantization_w8a8_int8/gemma2_example.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,11 @@ def tokenize(sample):
6767
# NOTE: transformers 4.49.0 results in a generation error with gemma2.
6868
# Consider either downgrading to a previous transformers version
6969
# or using vLLM for sample generation.
70+
# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
7071
print("========== SAMPLE GENERATION ==============")
7172
dispatch_for_generation(model)
7273
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
73-
output = model.generate(input_ids, max_new_tokens=20)
74+
output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
7475
print(tokenizer.decode(output[0]))
7576
print("==========================================")
7677

src/llmcompressor/args/dataset_arguments.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments):
171171
"will execute code present on the Hub on your local machine."
172172
},
173173
)
174+
# --- pipeline arguments --- #
174175
pipeline: Optional[str] = field(
175176
default="independent",
176177
metadata={

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 120 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import os
22
from datetime import datetime
3-
from typing import Optional
3+
from typing import TYPE_CHECKING, List, Optional, Union
44

55
from loguru import logger
66
from torch.utils.data import DataLoader
7-
from transformers import PreTrainedModel
7+
from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin
88

99
from llmcompressor.args import parse_args
1010
from llmcompressor.core.session_functions import active_session
@@ -14,6 +14,9 @@
1414

1515
__all__ = ["Oneshot", "oneshot"]
1616

17+
if TYPE_CHECKING:
18+
from datasets import Dataset, DatasetDict
19+
1720

1821
class Oneshot:
1922
"""
@@ -102,7 +105,8 @@ def __init__(
102105
:param recipe_args: RecipeArguments parameters, responsible for containing
103106
recipe-related parameters
104107
:param output_dir: Path to save the output model after carrying out oneshot
105-
108+
:param log_dir: Path to save logs during oneshot run.
109+
Nothing is logged to file if None.
106110
"""
107111
# Set up logging
108112
if log_dir:
@@ -191,8 +195,119 @@ def apply_recipe_modifiers(
191195
session.finalize()
192196

193197

194-
def oneshot(**kwargs) -> PreTrainedModel:
195-
one_shot = Oneshot(**kwargs)
198+
def oneshot(
199+
# Model arguments
200+
model: Union[str, PreTrainedModel],
201+
distill_teacher: Optional[str] = None,
202+
config_name: Optional[str] = None,
203+
tokenizer: Optional[Union[str, PreTrainedTokenizerBase]] = None,
204+
processor: Optional[Union[str, ProcessorMixin]] = None,
205+
cache_dir: Optional[str] = None,
206+
use_auth_token: bool = False,
207+
precision: str = "auto",
208+
tie_word_embeddings: bool = False,
209+
trust_remote_code_model: bool = False,
210+
save_compressed: bool = True,
211+
oneshot_device: str = "cuda:0",
212+
model_revision: str = "main",
213+
# Recipe arguments
214+
recipe: Optional[Union[str, List[str]]] = None,
215+
recipe_args: Optional[List[str]] = None,
216+
clear_sparse_session: bool = False,
217+
stage: Optional[str] = None,
218+
# Dataset arguments
219+
dataset: Optional[Union[str, "Dataset", "DatasetDict"]] = None,
220+
dataset_config_name: Optional[str] = None,
221+
dataset_path: Optional[str] = None,
222+
num_calibration_samples: int = 512,
223+
shuffle_calibration_samples: bool = True,
224+
max_seq_length: int = 384,
225+
pad_to_max_length: bool = True,
226+
text_column: str = "text",
227+
concatenate_data: bool = False,
228+
streaming: bool = False,
229+
overwrite_cache: bool = False,
230+
preprocessing_num_workers: Optional[int] = None,
231+
min_tokens_per_module: Optional[float] = None,
232+
trust_remote_code_data: bool = False,
233+
# Miscellaneous arguments
234+
output_dir: Optional[str] = None,
235+
log_dir: Optional[str] = "sparse_logs",
236+
**kwargs,
237+
) -> PreTrainedModel:
238+
"""
239+
Performs oneshot calibration on a model.
240+
241+
# Model arguments
242+
:param model: A pretrained model identifier from huggingface.co/models or a path
243+
to a local model. Required parameter.
244+
:param distill_teacher: Teacher model (a trained text generation model)
245+
for distillation.
246+
:param config_name: Pretrained config name or path if not the same as
247+
model_name.
248+
:param tokenizer: Pretrained tokenizer name or path if not the same as
249+
model_name.
250+
:param processor: Pretrained processor name or path if not the same as
251+
model_name.
252+
:param cache_dir: Where to store the pretrained data from
253+
huggingface.co.
254+
:param use_auth_token: Whether to use Hugging Face auth token for private
255+
models.
256+
:param precision: Precision to cast model weights to, default to auto.
257+
:param tie_word_embeddings: Whether the model's input and output word embeddings
258+
should be tied.
259+
:param trust_remote_code_model: Whether to allow for custom models to execute
260+
their own modeling files.
261+
:param save_compressed: Whether to compress sparse models during save.
262+
:param oneshot_device: Device to run oneshot calibration on.
263+
:param model_revision: The specific model version to use (can be branch name,
264+
tag, or commit id).
265+
266+
# Recipe arguments
267+
:param recipe: Path to a LLM Compressor sparsification recipe.
268+
:param recipe_args: List of recipe arguments to evaluate, in the
269+
format "key1=value1", "key2=value2".
270+
:param clear_sparse_session: Whether to clear CompressionSession/
271+
CompressionLifecycle data between runs.
272+
:param stage: The stage of the recipe to use for oneshot.
273+
274+
# Dataset arguments
275+
:param dataset: The name of the dataset to use (via the datasets
276+
library).
277+
:param dataset_config_name: The configuration name of the dataset
278+
to use.
279+
:param dataset_path: Path to a custom dataset. Supports json, csv, dvc.
280+
:param num_calibration_samples: Number of samples to use for one-shot
281+
calibration.
282+
:param shuffle_calibration_samples: Whether to shuffle the dataset before
283+
calibration.
284+
:param max_seq_length: Maximum total input sequence length after tokenization.
285+
:param pad_to_max_length: Whether to pad all samples to `max_seq_length`.
286+
:param text_column: Key to use as the `text` input to tokenizer/processor.
287+
:param concatenate_data: Whether to concatenate datapoints to fill
288+
max_seq_length.
289+
:param streaming: True to stream data from a cloud dataset.
290+
:param overwrite_cache: Whether to overwrite the cached preprocessed datasets.
291+
:param preprocessing_num_workers: Number of processes for
292+
preprocessing.
293+
:param min_tokens_per_module: Minimum percentage of tokens per
294+
module, relevant for MoE models.
295+
:param trust_remote_code_data: Whether to allow for datasets defined on the Hub
296+
using a dataset script.
297+
298+
# Miscellaneous arguments
299+
:param output_dir: Path to save the output model after calibration.
300+
Nothing is saved if None.
301+
:param log_dir: Path to save logs during oneshot run.
302+
Nothing is logged to file if None.
303+
304+
:return: The calibrated PreTrainedModel
305+
"""
306+
307+
# pass all args directly into Oneshot
308+
local_args = locals()
309+
local_args.pop("kwargs")
310+
one_shot = Oneshot(**local_args, **kwargs)
196311
one_shot()
197312

198313
return one_shot.model

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,11 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
7272
)
7373
).reshape([1])
7474

75-
update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
76-
update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
77-
update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
78-
del global_scale
75+
update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
76+
update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
77+
update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
78+
79+
del global_scale
7980

8081
if _is_mlp_module(submodule):
8182
if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
@@ -91,10 +92,7 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
9192
)
9293
).reshape([1])
9394

94-
update_parameter_data(
95-
submodule.gate_proj, global_scale, "weight_global_scale"
96-
)
97-
update_parameter_data(
98-
submodule.up_proj, global_scale, "weight_global_scale"
99-
)
100-
del global_scale
95+
update_parameter_data(submodule.gate_proj, global_scale, "weight_global_scale")
96+
update_parameter_data(submodule.up_proj, global_scale, "weight_global_scale")
97+
98+
del global_scale

src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@
99
from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
1010
from llmcompressor.pipelines.registry import CalibrationPipeline
1111
from llmcompressor.pytorch.utils.helpers import tensors_to_device
12-
from llmcompressor.utils.dev import dispatch_for_generation
13-
from llmcompressor.utils.helpers import calibration_forward_context
12+
from llmcompressor.utils import calibration_forward_context, dispatch_for_generation
1413

1514
if TYPE_CHECKING:
1615
from llmcompressor.args.dataset_arguments import DatasetArguments

src/llmcompressor/pipelines/cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def empty(cls, num_batches: int, offload_device: torch.device):
6161
def from_dataloader(
6262
cls,
6363
dataloader: torch.utils.data.DataLoader,
64-
model_device: torch.device,
64+
model_device: torch.device = torch.device("cpu"),
6565
mask_padding: bool = True,
6666
offload_device: Optional[torch.device] = torch.device("cpu"),
6767
):

src/llmcompressor/pipelines/layer_sequential/helpers.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import torch
77
import tqdm
88
from compressed_tensors.quantization import find_name_or_class_matches
9-
from compressed_tensors.utils import get_execution_device
109
from torch.nn import Module
1110
from torch.utils.data.dataloader import DataLoader
1211

@@ -45,6 +44,7 @@ def capture_first_layer_intermediates(
4544
model: Module,
4645
first_layer: Module,
4746
dataloader: DataLoader,
47+
model_device: torch.device = torch.device("cpu"),
4848
mask_padding: bool = True,
4949
) -> IntermediatesCache:
5050
"""
@@ -62,7 +62,6 @@ def capture_first_layer_intermediates(
6262
:param mask_padding: zero out padding tokens if True. This affects modifiers such as
6363
GPTQ and SparseGPT
6464
"""
65-
model_device = get_execution_device(model)
6665
intermediates = IntermediatesCache.empty(len(dataloader), torch.device("cpu"))
6766
signature = inspect.signature(first_layer.forward)
6867

src/llmcompressor/pipelines/layer_sequential/pipeline.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import torch
44
import tqdm
5-
from compressed_tensors.utils import disable_offloading
5+
from compressed_tensors.utils import disable_offloading, get_execution_device
66
from torch.utils.data.dataloader import DataLoader
77

88
from llmcompressor.core import LifecycleCallbacks, active_session
@@ -60,6 +60,7 @@ def __call__(
6060

6161
# prepare model for sequential onloading
6262
dispatch_for_sequential(model)
63+
model_device = get_execution_device(model)
6364

6465
# find layers
6566
modifiers = session.get_modifiers()
@@ -71,7 +72,7 @@ def __call__(
7172
with calibration_forward_context(model), DisableQuantization(model):
7273
# prepare intermediates cache
7374
intermediates: IntermediatesCache = capture_first_layer_intermediates(
74-
model, layers[0], dataloader
75+
model, layers[0], dataloader, model_device
7576
)
7677

7778
num_layers = len(layers)

0 commit comments

Comments (0)