Commit 61ca3ce

always disable

Signed-off-by: Kyle Sayers <[email protected]>
1 parent f8c15dc · commit 61ca3ce

File tree: 4 files changed (+51 −48 lines)

  src/llmcompressor/pipelines/basic/pipeline.py
  src/llmcompressor/pipelines/sequential/pipeline.py
  src/llmcompressor/utils/helpers.py
  tests/llmcompressor/utils/test_helpers.py
src/llmcompressor/pipelines/basic/pipeline.py (1 addition, 10 deletions)

@@ -6,15 +6,13 @@
 from compressed_tensors.utils import get_execution_device
 from torch.utils.data.dataloader import DataLoader

-from llmcompressor.core import LifecycleCallbacks, active_session
+from llmcompressor.core import LifecycleCallbacks
 from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pytorch.utils.helpers import tensors_to_device
 from llmcompressor.utils import (
     calibration_forward_context,
-    disable_lm_head,
     dispatch_for_generation,
-    requires_lm_head_calibration,
 )

 if TYPE_CHECKING:
@@ -43,20 +41,13 @@ def __call__(
         :param dataloader: loads data for calibration
         :param dataset_args: dataset arguments relevant to pipelines
         """
-        session = active_session()
-        modifiers = session.lifecycle.recipe.modifiers
-
         dispatch_for_generation(model)  # basic dispatch is identical to generation
         model_device = get_execution_device(model)

         LifecycleCallbacks.calibration_epoch_start()

         with contextlib.ExitStack() as stack:
             stack.enter_context(calibration_forward_context(model))
-            # Optionally disable lm_head
-            if not requires_lm_head_calibration(model, modifiers):
-                stack.enter_context(disable_lm_head(model))
-
             for batch in tqdm.tqdm(dataloader, desc="Calibrating"):
                 batch = apply_pad_mask_to_batch(batch)
                 batch = tensors_to_device(batch, model_device)

src/llmcompressor/pipelines/sequential/pipeline.py (0 additions, 4 deletions)

@@ -19,8 +19,6 @@
     DISABLE_QAC_MODIFIERS,
     DisableQuantization,
     calibration_forward_context,
-    disable_lm_head,
-    requires_lm_head_calibration,
 )

 if TYPE_CHECKING:
@@ -91,8 +89,6 @@ def __call__(
             if not dataset_args.quantization_aware_calibration or disable_qac:
                 stack.enter_context(DisableQuantization(model))
             # Optionally disable lm_head
-            if not requires_lm_head_calibration(model, modifiers):
-                stack.enter_context(disable_lm_head(model))

             # prepare intermediates cache
             activations = IntermediatesCache.from_dataloader(dataloader, model_device)
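
Both pipeline diffs above lean on the same mechanic: contexts pushed onto a contextlib.ExitStack are unwound in reverse order when the block exits, so anything calibration_forward_context now disables (including the lm_head after this commit) is restored automatically once calibration finishes. A minimal standalone sketch of that mechanic, stdlib only and not code from this commit:

import contextlib

@contextlib.contextmanager
def announce(name: str):
    # toy stand-in for calibration_forward_context / DisableQuantization
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")

with contextlib.ExitStack() as stack:
    stack.enter_context(announce("calibration_forward_context"))
    stack.enter_context(announce("DisableQuantization"))
    print("calibrating...")

# prints: enter calibration_forward_context, enter DisableQuantization,
#         calibrating..., exit DisableQuantization, exit calibration_forward_context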

src/llmcompressor/utils/helpers.py (16 additions, 27 deletions)

@@ -23,15 +23,14 @@

 import numpy
 import torch
-from compressed_tensors import has_offloaded_params, match_named_modules
 from compressed_tensors.quantization import disable_quantization, enable_quantization
 from loguru import logger
 from transformers import PreTrainedModel

-from llmcompressor.utils import get_embeddings, targets_embeddings
+from llmcompressor.utils import get_embeddings

 if TYPE_CHECKING:
-    from llmcompressor.modifiers import Modifier
+    pass

 __all__ = [
     "ALL_TOKEN",
@@ -72,7 +71,6 @@
     "eval_context",
     "calibration_forward_context",
     "disable_lm_head",
-    "requires_lm_head_calibration",
     "patch_attr",
     "disable_hf_kernels",
     "DISABLE_QAC_MODIFIERS",
@@ -1057,12 +1055,14 @@ def calibration_forward_context(model: torch.nn.Module):
     - Disable the KV cache
     - Disable train mode and enable eval mode
     - Disable hf kernels which could bypass hooks
+    - Disable lm head (input and weights can still be calibrated, output will be meta)
     """
     with contextlib.ExitStack() as stack:
         stack.enter_context(torch.no_grad())
         stack.enter_context(disable_cache(model))
         stack.enter_context(eval_context(model))
         stack.enter_context(disable_hf_kernels(model))
+        stack.enter_context(disable_lm_head(model))
         yield


@@ -1074,13 +1074,18 @@ def disable_lm_head(model: torch.nn.Module):
     """
     _, lm_head = get_embeddings(model)
     if lm_head is not None:
-        if has_offloaded_params(lm_head):
-            # keep weight on meta device
-            with patch_attr(lm_head._hf_hook, "offload", False):
-                yield
-        else:
-            with patch_attr(lm_head, "weight", lm_head.weight.to("meta")):
-                yield
+        if not isinstance(lm_head, torch.nn.Linear):
+            raise NotImplementedError(
+                f"Cannot disable LM head of type {lm_head.__class__.__name__}"
+            )
+
+        dummy_weight = lm_head.weight.to("meta")
+
+        def dummy_forward(self, input: torch.Tensor) -> torch.Tensor:
+            return input.to("meta") @ dummy_weight.T
+
+        with patch_attr(lm_head, "forward", dummy_forward.__get__(lm_head)):
+            yield

     else:
         logger.warning(
@@ -1090,22 +1095,6 @@ def disable_lm_head(model: torch.nn.Module):
         yield


-def requires_lm_head_calibration(
-    model: PreTrainedModel, modifiers: Iterable["Modifier"]
-) -> bool:
-    """Returns True if any of the quantization modifers target the lm_head"""
-    from llmcompressor.modifiers.quantization.quantization.mixin import (
-        QuantizationMixin,
-    )
-
-    targets = set()
-    for mod in modifiers:
-        if isinstance(mod, QuantizationMixin):
-            targets |= set(match_named_modules(model, mod.resolved_targets, mod.ignore))
-
-    return targets_embeddings(model, targets, check_input=False, check_output=True)
-
-
 @contextlib.contextmanager
 def patch_attr(base: object, attr: str, value: Any):
     """

tests/llmcompressor/utils/test_helpers.py (34 additions, 7 deletions)

@@ -5,23 +5,23 @@
 from transformers import (
     AutoModelForCausalLM,
     MllamaForConditionalGeneration,
-    PretrainedConfig,
-    PreTrainedModel,
 )

+from llmcompressor.pipelines.sequential.helpers import dispatch_for_sequential
 from llmcompressor.utils import (
     ALL_TOKEN,
     DisableQuantization,
     calibration_forward_context,
     convert_to_bool,
     disable_cache,
+    disable_lm_head,
     flatten_iterable,
     getattr_chain,
     interpolate,
     patch_attr,
     validate_str_iterable,
 )
-from llmcompressor.utils.dev import skip_weights_download
+from llmcompressor.utils.dev import dispatch_for_generation, skip_weights_download
 from tests.testing_utils import requires_gpu


@@ -149,20 +149,21 @@ def test_DisableQuantization():

 @pytest.mark.unit
 def test_calibration_forward_context():
-    class DummyModel(PreTrainedModel):
-        config_class = PretrainedConfig
-
-    model = DummyModel(PretrainedConfig())
+    with skip_weights_download():
+        model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2")
     model.config.use_cache = True
     model.train()

     with calibration_forward_context(model):
         assert not torch.is_grad_enabled()
         assert not model.config.use_cache
         assert not model.training
+        assert model.lm_head.forward.__name__ == "dummy_forward"
+
     assert torch.is_grad_enabled()
     assert model.config.use_cache
     assert model.training
+    assert model.lm_head.forward.__name__ == "forward"


 @pytest.mark.unit
@@ -203,3 +204,29 @@ def test_disable_cache(model_cls, model_stub):

     output = model(**inputs)
     assert output.past_key_values is not None
+
+
+@requires_gpu
+@pytest.mark.parametrize("offload", ["sequential", "basic", "none"])
+def test_disable_lm_head(offload):
+    model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2")
+    if offload == "sequential":
+        dispatch_for_sequential(model)
+    if offload == "basic":
+        dispatch_for_generation(model)
+    if offload == "none":
+        model = model.to("cuda")
+
+    lm_input_device = None
+
+    def hook(module, args):
+        nonlocal lm_input_device
+        lm_input_device = args[0].device
+
+    model.lm_head.register_forward_pre_hook(hook)
+
+    with disable_lm_head(model):
+        input = {key: value.to("cuda") for key, value in model.dummy_inputs.items()}
+        output = model(**input)
+    assert lm_input_device == torch.device("cuda:0")
+    assert output.logits.device == torch.device("meta")
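
To exercise the change locally, the touched tests can be selected with pytest's keyword filter; the GPU-marked test_disable_lm_head cases need a CUDA device (a standard pytest invocation, not part of the commit):

    pytest tests/llmcompressor/utils/test_helpers.py -k "calibration_forward_context or disable_lm_head" -v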
