
Commit 44dbf91 (parent: fe01901)

wip

Signed-off-by: Kyle Sayers <[email protected]>

File tree: 7 files changed (+39, -32 lines)

  examples/quantization_w4a16/llama3_example.py
  src/llmcompressor/modifiers/smoothquant/base.py
  src/llmcompressor/modifiers/utils/pytorch_helpers.py
  src/llmcompressor/pipelines/basic/pipeline.py
  src/llmcompressor/pipelines/sequential/helpers.py
  src/llmcompressor/pipelines/sequential/pipeline.py
  src/llmcompressor/utils/helpers.py

examples/quantization_w4a16/llama3_example.py

Lines changed: 2 additions & 1 deletion

@@ -16,7 +16,7 @@

 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
+NUM_CALIBRATION_SAMPLES = 12
 MAX_SEQUENCE_LENGTH = 2048

 # Load dataset and preprocess.
@@ -57,6 +57,7 @@ def tokenize(sample):
 oneshot(
     model=model,
     dataset=ds,
+    batch_size=12,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
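
Note: with NUM_CALIBRATION_SAMPLES and batch_size both set to 12, the calibration set fits in a single forward batch. A minimal sketch of that relationship, assuming the standard ceil-division batching of a dataloader with drop_last=False (the names below are illustrative, not part of the example file):

import math

NUM_CALIBRATION_SAMPLES = 12
BATCH_SIZE = 12

# Number of calibration batches a dataloader would yield with ceil division:
# the previous default of 512 samples would give 43 batches at this batch
# size, while the wip values above give exactly one.
num_batches = math.ceil(NUM_CALIBRATION_SAMPLES / BATCH_SIZE)
print(num_batches)  # 1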

src/llmcompressor/modifiers/smoothquant/base.py

Lines changed: 1 addition & 4 deletions

@@ -55,7 +55,6 @@ class SmoothQuantMapping:

     smooth_name: str
     smooth_layer: Module
-    balance_names: List[str]
     balance_layers: List[Module]


@@ -216,20 +215,18 @@ def _resolve_mappings(self, model: Module) -> List[SmoothQuantMapping]:
             to_smooth_layers = get_layers(to_smooth, model)
             for layer_name, smooth_layer in to_smooth_layers.items():
                 if not match_targets(layer_name, self.ignore)[0]:
-                    balance_names = []
                     balance_layers = []
                     for balance_suffix in to_balance:
                         # find the submodule that matches the activation layer
                         balance_name, balance_layer = get_matching_layer(
                             balance_suffix, layer_name, model
                         )
                         if balance_layer:
-                            balance_names.append(balance_name)
                             balance_layers.append(balance_layer)
                     # each mapping can contain multiple layers to balance, but only
                     # one layer to smooth
                     mapping = SmoothQuantMapping(
-                        layer_name, smooth_layer, balance_names, balance_layers
+                        layer_name, smooth_layer, balance_layers
                     )
                     resolved_mappings.append(mapping)
         return resolved_mappings
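
The mapping no longer carries a parallel list of balance layer names; the layer modules themselves are enough for the smoothing pass. A sketch of how the dataclass reads after this change, assuming the @dataclass decorator and imports from the surrounding file:

from dataclasses import dataclass
from typing import List

from torch.nn import Module


@dataclass
class SmoothQuantMapping:
    # Layer whose activations are smoothed, plus the downstream layers whose
    # weights absorb the corresponding smoothing scales.
    smooth_name: str
    smooth_layer: Module
    balance_layers: List[Module]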

src/llmcompressor/modifiers/utils/pytorch_helpers.py

Lines changed: 1 addition & 4 deletions

@@ -7,14 +7,11 @@
 tensor operations for compression workflows.
 """

-from typing import TYPE_CHECKING, Dict
+from typing import Dict

 import torch
 from torch.nn import Module

-if TYPE_CHECKING:
-    pass
-
 __all__ = [
     "apply_pad_mask_to_batch",
     "is_moe_model",

src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 9 additions & 2 deletions

@@ -6,11 +6,11 @@
 from compressed_tensors.utils import get_execution_device
 from torch.utils.data.dataloader import DataLoader

-from llmcompressor.core import LifecycleCallbacks
+from llmcompressor.core import LifecycleCallbacks, active_session
 from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pytorch.utils.helpers import tensors_to_device
-from llmcompressor.utils import calibration_forward_context, dispatch_for_generation
+from llmcompressor.utils import calibration_forward_context, dispatch_for_generation, targets_lm_head, disable_lm_head

 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments
@@ -38,13 +38,20 @@ def __call__(
         :param dataloader: loads data for calibration
         :param dataset_args: dataset arguments relevant to pipelines
         """
+        session = active_session()
+        modifiers = session.lifecycle.recipe.modifiers
+
         dispatch_for_generation(model)  # basic dispatch is identical to generation
         model_device = get_execution_device(model)

         LifecycleCallbacks.calibration_epoch_start()

         with contextlib.ExitStack() as stack:
             stack.enter_context(calibration_forward_context(model))
+            # Optional disable lm_head
+            if not targets_lm_head(model, modifiers):
+                stack.enter_context(disable_lm_head(model))
+
             for batch in tqdm.tqdm(dataloader, desc="Calibrating"):
                 batch = apply_pad_mask_to_batch(batch)
                 batch = tensors_to_device(batch, model_device)
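
Both calibration pipelines now look up the active session's modifiers and, when none of them target the lm_head, enter disable_lm_head(model) so calibration skips the large hidden-to-vocab projection. The body of disable_lm_head is not part of this diff; the following is only a plausible sketch of such a context manager (hypothetical name and behavior, not the actual llm-compressor helper), swapping the head's forward for a cheap stand-in and restoring it on exit:

import contextlib

import torch


@contextlib.contextmanager
def disable_lm_head_sketch(model: torch.nn.Module):
    """Hypothetical: temporarily skip the model's output head during calibration."""
    # transformers models expose the head via get_output_embeddings(); fall
    # back to a no-op when the attribute is missing.
    get_head = getattr(model, "get_output_embeddings", None)
    lm_head = get_head() if callable(get_head) else None
    if lm_head is None:
        yield
        return

    original_forward = lm_head.forward

    def _skip_forward(hidden_states, *args, **kwargs):
        # Return an empty tensor instead of projecting to the vocabulary,
        # avoiding the hidden_size x vocab_size matmul during calibration.
        return hidden_states.new_empty(0)

    lm_head.forward = _skip_forward
    try:
        yield
    finally:
        lm_head.forward = original_forward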

src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 0 additions & 12 deletions

@@ -24,9 +24,6 @@
 from llmcompressor.modifiers import Modifier
 from llmcompressor.modifiers.utils.hooks import HooksMixin
 from llmcompressor.pipelines.sequential.transformers_helpers import HFTracer
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    targets_embeddings,
-)
 from llmcompressor.utils.helpers import calibration_forward_context, patch_attr
 from llmcompressor.utils.pytorch.module import get_no_split_params

@@ -40,7 +37,6 @@
     "Subgraph",
     "get_sequential_targets",
     "dispatch_for_sequential",
-    "targets_lm_head",
 ]


@@ -499,14 +495,6 @@ def get_sequential_targets(
     return sequential_targets


-def targets_lm_head(model: PreTrainedModel, modifiers: list[Modifier]) -> bool:
-    targets = sum(
-        (list(modifier.get_targets(model)) for modifier in modifiers), start=[]
-    )
-
-    return targets_embeddings(model, targets, check_input=True, check_output=False)
-
-
 def add_line_numbers(text: str) -> str:
     lines = text.splitlines()
     numbered_lines = [f"{i + 1} {line}" for i, line in enumerate(lines)]

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 6 additions & 3 deletions

@@ -13,13 +13,14 @@
 from llmcompressor.pipelines.sequential.helpers import (
     dispatch_for_sequential,
     get_sequential_targets,
-    targets_lm_head,
     trace_subgraphs,
 )
 from llmcompressor.utils.helpers import (
     DISABLE_QAC_MODIFIERS,
     DisableQuantization,
     calibration_forward_context,
+    targets_lm_head,
+    disable_lm_head,
 )

 if TYPE_CHECKING:
@@ -83,13 +84,15 @@ def __call__(
             type(mod).__name__ in DISABLE_QAC_MODIFIERS
             for mod in session.lifecycle.recipe.modifiers
         )
-        skip_lm_head = not targets_lm_head(model, modifiers)

         with contextlib.ExitStack() as stack:
-            stack.enter_context(calibration_forward_context(model, skip_lm_head))
+            stack.enter_context(calibration_forward_context(model))
             # Optionally disable quantization
             if not dataset_args.quantization_aware_calibration or disable_qac:
                 stack.enter_context(DisableQuantization(model))
+            # Optional disable lm_head
+            if not targets_lm_head(model, modifiers):
+                stack.enter_context(disable_lm_head(model))

             # prepare intermediates cache
             activations = IntermediatesCache.from_dataloader(dataloader, model_device)
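
Taken together with the helpers.py change below, the caller-side pattern shifts: instead of threading a skip_lm_head flag into calibration_forward_context, each pipeline keeps that context minimal and stacks the optional behaviors itself. A condensed sketch of the pattern, assuming the three helpers are importable from llmcompressor.utils as in the basic pipeline diff above and that run_calibration stands in for the pipeline's batch loop:

import contextlib

from llmcompressor.utils import (
    calibration_forward_context,
    disable_lm_head,
    targets_lm_head,
)


def calibrate(model, modifiers, run_calibration):
    # Previously: calibration_forward_context(model, skip_lm_head) bundled the
    # lm_head behavior. Now each optional context is entered explicitly.
    with contextlib.ExitStack() as stack:
        stack.enter_context(calibration_forward_context(model))
        if not targets_lm_head(model, modifiers):
            stack.enter_context(disable_lm_head(model))
        run_calibration(model)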

src/llmcompressor/utils/helpers.py

Lines changed: 20 additions & 6 deletions

@@ -18,7 +18,7 @@
 from collections import OrderedDict
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Tuple, Union, TYPE_CHECKING
 from urllib.parse import urlparse

 import numpy
@@ -28,6 +28,9 @@
 from loguru import logger
 from transformers import PreTrainedModel

+if TYPE_CHECKING:
+    from llmcompressor.modifiers.modifier import Modifier
+
 __all__ = [
     "ALL_TOKEN",
     "ALL_PRUNABLE_TOKEN",
@@ -68,6 +71,8 @@
     "calibration_forward_context",
     "patch_attr",
     "disable_hf_kernels",
+    "disable_lm_head",
+    "targets_lm_head",
     "DISABLE_QAC_MODIFIERS",
 ]

@@ -1042,23 +1047,20 @@ def disable_hf_kernels(module: torch.nn.Module):


 @contextlib.contextmanager
-def calibration_forward_context(model: torch.nn.Module, skip_lm_head: bool = False):
+def calibration_forward_context(model: torch.nn.Module):
     """
     Context in which all calibration forward passes should occur.

     - Remove gradient calculations
     - Disable the KV cache
     - Disable train mode and enable eval mode
     - Disable hf kernels which could bypass hooks
-    - Disable lm_head of model (optional)
     """
     with contextlib.ExitStack() as stack:
         stack.enter_context(torch.no_grad())
         stack.enter_context(disable_cache(model))
         stack.enter_context(eval_context(model))
         stack.enter_context(disable_hf_kernels(model))
-        if skip_lm_head:
-            stack.enter_context(disable_lm_head(model))

         yield

@@ -1091,7 +1093,19 @@ def disable_lm_head(model: torch.nn.Module):
     yield


-# TODO: deprecate
+def targets_lm_head(model: PreTrainedModel, modifiers: list["Modifier"]) -> bool:
+    """ Returns True if the given modifiers target the lm_head """
+    from llmcompressor.transformers.compression.compressed_tensors_utils import (
+        targets_embeddings
+    )
+
+    targets = sum(
+        (list(modifier.get_targets(model)) for modifier in modifiers), start=[]
+    )
+    return targets_embeddings(model, targets, check_input=True, check_output=False)
+
+
+
 @contextlib.contextmanager
 def patch_attr(base: object, attr: str, value: Any):
     """
