18 | 18 | from collections import OrderedDict |
19 | 19 | from io import BytesIO |
20 | 20 | from pathlib import Path |
21 | | -from typing import Any, Callable, Dict, Iterable, List, Tuple, Union |
| 21 | +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Tuple, Union |
22 | 22 | from urllib.parse import urlparse |
23 | 23 |
24 | 24 | import numpy |
25 | 25 | import torch |
| 26 | +from compressed_tensors import has_offloaded_params, match_named_modules |
26 | 27 | from compressed_tensors.quantization import disable_quantization, enable_quantization |
27 | 28 | from loguru import logger |
28 | 29 | from transformers import PreTrainedModel |
29 | 30 |
| 31 | +from llmcompressor.utils import get_embeddings, targets_embeddings |
| 32 | + |
| 33 | +if TYPE_CHECKING: |
| 34 | +    from llmcompressor.modifiers import Modifier |
| 35 | + |
30 | 36 | __all__ = [ |
31 | 37 |     "ALL_TOKEN", |
32 | 38 |     "ALL_PRUNABLE_TOKEN", |
|
65 | 71 |     "DisableQuantization", |
66 | 72 |     "eval_context", |
67 | 73 |     "calibration_forward_context", |
| 74 | +    "disable_lm_head", |
| 75 | +    "requires_lm_head_calibration", |
68 | 76 |     "patch_attr", |
69 | 77 |     "disable_hf_kernels", |
70 | 78 |     "DISABLE_QAC_MODIFIERS", |
@@ -1050,12 +1058,54 @@ def calibration_forward_context(model: torch.nn.Module): |
1050 | 1058 |     - Disable train mode and enable eval mode |
1051 | 1059 |     - Disable hf kernels which could bypass hooks |
1052 | 1060 |     """ |
1053 | | -    with torch.no_grad(), disable_cache(model), eval_context(model), disable_hf_kernels( |
1054 | | -        model |
1055 | | -    ): |
| 1061 | +    with contextlib.ExitStack() as stack: |
| 1062 | +        stack.enter_context(torch.no_grad()) |
| 1063 | +        stack.enter_context(disable_cache(model)) |
| 1064 | +        stack.enter_context(eval_context(model)) |
| 1065 | +        stack.enter_context(disable_hf_kernels(model)) |
1056 | 1066 |         yield |
1057 | 1067 |
1058 | 1068 |
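For reference, a minimal usage sketch of the rewritten `calibration_forward_context` (the `ExitStack` form is behaviorally equivalent to the previous nested `with`). The `model` and `dataloader` names below are placeholders, not part of this diff:

```python
# Sketch only: inside the context, gradients, the KV cache, train mode, and
# HF kernels are all disabled, so registered hooks observe every forward pass.
with calibration_forward_context(model):
    for batch in dataloader:  # hypothetical calibration dataloader
        model(**batch)
```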
| 1069 | +@contextlib.contextmanager |
| 1070 | +def disable_lm_head(model: torch.nn.Module): |
| 1071 | + """ |
| 1072 | + Disable the lm_head of a model by moving it to the meta device. This function |
| 1073 | + does not untie parameters and restores the model proper loading upon exit |
| 1074 | + """ |
| 1075 | + _, lm_head = get_embeddings(model) |
| 1076 | + if lm_head is not None: |
| 1077 | + if has_offloaded_params(lm_head): |
| 1078 | + # keep weight on meta device |
| 1079 | + with patch_attr(lm_head._hf_hook, "offload", False): |
| 1080 | + yield |
| 1081 | + else: |
| 1082 | + with patch_attr(lm_head, "weight", lm_head.weight.to("meta")): |
| 1083 | + yield |
| 1084 | + |
| 1085 | + else: |
| 1086 | + logger.warning( |
| 1087 | + f"Attempted to disable lm_head of instance {model.__class__.__name__}, " |
| 1088 | + "but was unable to to find lm_head. This may lead to unexpected OOM." |
| 1089 | + ) |
| 1090 | + yield |
| 1091 | + |
| 1092 | + |
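Both branches of `disable_lm_head` lean on `patch_attr` (defined further down in this file) to make the change strictly temporary. A toy sketch of that restore-on-exit behavior, using an illustrative `Config` object rather than a real model:

```python
# Illustrative only: patch_attr swaps an attribute for the duration of the
# context and puts the original value back when the context exits.
class Config:
    def __init__(self):
        self.mode = "train"

cfg = Config()
with patch_attr(cfg, "mode", "calibrate"):
    assert cfg.mode == "calibrate"  # patched value visible inside the context
assert cfg.mode == "train"          # original value restored on exit
```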
| 1093 | +def requires_lm_head_calibration( |
| 1094 | +    model: PreTrainedModel, modifiers: Iterable["Modifier"] |
| 1095 | +) -> bool: |
| 1096 | +    """Returns True if any of the quantization modifiers target the lm_head""" |
| 1097 | +    from llmcompressor.modifiers.quantization.quantization.mixin import ( |
| 1098 | +        QuantizationMixin, |
| 1099 | +    ) |
| 1100 | + |
| 1101 | +    targets = set() |
| 1102 | +    for mod in modifiers: |
| 1103 | +        if isinstance(mod, QuantizationMixin): |
| 1104 | +            targets |= set(match_named_modules(model, mod.resolved_targets, mod.ignore)) |
| 1105 | + |
| 1106 | +    return targets_embeddings(model, targets, check_input=True, check_output=False) |
| 1107 | + |
| 1108 | + |
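A hedged call-site sketch of how the two new helpers might compose; `modifiers` and `run_calibration` are placeholders for the calling pipeline's objects, not APIs introduced by this diff:

```python
# Hypothetical composition: keep the lm_head weight on the meta device during
# calibration only when no quantization modifier needs its input activations.
with contextlib.ExitStack() as stack:
    stack.enter_context(calibration_forward_context(model))
    if not requires_lm_head_calibration(model, modifiers):
        stack.enter_context(disable_lm_head(model))
    run_calibration(model)  # placeholder for the calibration forward passes
```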
1059 | 1109 | @contextlib.contextmanager |
1060 | 1110 | def patch_attr(base: object, attr: str, value: Any): |
1061 | 1111 |     """ |