Skip to content

Commit da7ef54

Browse files
committed
refactor: address HDCharles review comments
- rename flush_activation_qparams -> write_activation_qparams
- rename calibrate_module_from_observer -> update_module_qparams_from_observer
- extract ACTIVATION_BASE_NAMES constant in calibration.py
- move SEQUENTIAL_EPOCH_END docstring note from on_start to on_event
- use ExitStack for propagation pass quantization management
- update observer.forward() to accumulate stats alongside computing qparams
1 parent 316114a commit da7ef54

File tree

4 files changed

+29
-23
lines changed

4 files changed

+29
-23
lines changed

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from torch.nn import Module
1818

1919
from llmcompressor.observers import Observer
20-
from llmcompressor.observers.base import calibrate_module_from_observer
20+
from llmcompressor.observers.base import update_module_qparams_from_observer
2121

2222
__all__ = [
2323
"initialize_observer",
@@ -31,9 +31,12 @@
3131
"calibrate_query_hook",
3232
"calibrate_key_hook",
3333
"calibrate_value_hook",
34-
"flush_activation_qparams",
34+
"write_activation_qparams",
3535
]
3636

37+
# Activation observer base names used across calibration and quantization code
38+
ACTIVATION_BASE_NAMES = ("input", "output", "q", "k", "v")
39+
3740

3841
def initialize_observer(
3942
module: Module,
@@ -171,7 +174,7 @@ def calibrate_activations(
171174
:param stats_only: if True, only update running statistics in the observer
172175
(accumulate min/max) without computing or writing scale/zero_point.
173176
Used during deferred qparam calibration — qparams are computed once
174-
at epoch end via flush_activation_qparams instead of per batch.
177+
at epoch end via write_activation_qparams instead of per batch.
175178
"""
176179
# If empty tensor, can't update zp/scale
177180
# Case for MoEs
@@ -193,7 +196,7 @@ def calibrate_activations(
193196

194197
# In deferred (stats_only) mode: call the observer to accumulate running
195198
# min/max stats but do NOT write scale/zero_point yet.
196-
# Qparams are written once at epoch end via flush_activation_qparams.
199+
# Qparams are written once at epoch end via write_activation_qparams.
197200
if stats_only:
198201
observer = getattr(module, f"{base_name}_observer", None)
199202
if observer is not None:
@@ -213,7 +216,7 @@ def calibrate_input_hook(module: Module, args: Any):
213216
"""
214217
Hook to accumulate input activation statistics (min/max) in the observer.
215218
Scale and zero_point are not written here; they are computed once per subgraph
216-
at epoch end via flush_activation_qparams.
219+
at epoch end via write_activation_qparams.
217220
"""
218221
args = args[0] if isinstance(args, tuple) else args
219222
calibrate_activations(module, value=args, base_name="input", stats_only=True)
@@ -223,7 +226,7 @@ def calibrate_output_hook(module: Module, _args: Any, output: torch.Tensor):
223226
"""
224227
Hook to accumulate output activation statistics (min/max) in the observer.
225228
Scale and zero_point are not written here; they are computed once per subgraph
226-
at epoch end via flush_activation_qparams.
229+
at epoch end via write_activation_qparams.
227230
Note: forward_quantize is intentionally absent — hooks only collect statistics.
228231
"""
229232
calibrate_activations(
@@ -287,7 +290,7 @@ def reset_quantization_status(model: Module):
287290
delattr(module, "quantization_status")
288291

289292

290-
def flush_activation_qparams(module: Module):
293+
def write_activation_qparams(module: Module):
291294
"""
292295
Compute and write final activation qparams from each observer's accumulated
293296
running statistics, then free those statistics to reduce memory.
@@ -301,13 +304,13 @@ def flush_activation_qparams(module: Module):
301304
302305
apply to targeted modules with:
303306
for _, module in match_named_modules(...):
304-
flush_activation_qparams(module)
307+
write_activation_qparams(module)
305308
306309
:param module: module to flush activation qparams for
307310
"""
308311
scheme = getattr(module, "quantization_scheme", None)
309312
if scheme is None:
310313
return
311314

312-
for base_name in ("input", "output", "q", "k", "v"):
313-
calibrate_module_from_observer(module, base_name)
315+
for base_name in ACTIVATION_BASE_NAMES:
316+
update_module_qparams_from_observer(module, base_name)

src/llmcompressor/modifiers/quantization/quantization/base.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from llmcompressor.core import Event, EventType, State
55
from llmcompressor.modifiers import Modifier
66
from llmcompressor.modifiers.quantization.calibration import (
7-
flush_activation_qparams,
7+
write_activation_qparams,
88
update_weight_global_scale,
99
update_weight_zp_scale,
1010
)
@@ -67,8 +67,6 @@ def on_initialize(self, state: State, **kwargs) -> bool:
6767
def on_start(self, state: State, event: Event, **kwargs):
6868
"""
6969
Begin calibrating activations and weights. Calibrate weights only once on start.
70-
Activation qparams are computed once per subgraph at SEQUENTIAL_EPOCH_END via
71-
flush_activation_qparams, rather than per batch.
7270
"""
7371
self.started_ = True
7472
QuantizationMixin.start_calibration(self, state.model)
@@ -99,12 +97,13 @@ def on_event(self, state: State, event: Event, **kwargs):
9997
self.on_start(state, None)
10098

10199
if event.type_ == EventType.SEQUENTIAL_EPOCH_END:
102-
# Compute scale/zero_point once from accumulated running statistics,
103-
# then free those stats to reduce memory.
100+
# Activation qparams are computed once per subgraph at SEQUENTIAL_EPOCH_END
101+
# from accumulated running statistics, rather than per batch.
102+
# Running statistics are freed after qparams are written to reduce memory.
104103
for _, module in match_named_modules(
105104
state.model, self.resolved_targets, self.ignore
106105
):
107-
flush_activation_qparams(module)
106+
write_activation_qparams(module)
108107

109108
if event.type_ == EventType.CALIBRATION_EPOCH_END:
110109
if not self.ended_:

src/llmcompressor/observers/base.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from compressed_tensors.utils import align_module_device, update_offload_parameter
1111
from llmcompressor.observers.helpers import flatten_for_calibration
1212

13-
__all__ = ["Observer", "MinMaxTuple", "ScaleZpTuple", "calibrate_module_from_observer"]
13+
__all__ = ["Observer", "MinMaxTuple", "ScaleZpTuple", "update_module_qparams_from_observer"]
1414

1515
MinMaxTuple = Tuple[torch.Tensor, torch.Tensor]
1616
ScaleZpTuple = Tuple[torch.Tensor, torch.Tensor]
@@ -127,12 +127,14 @@ def clear_accumulated_stats(self):
127127
@torch.no_grad
128128
def forward(self, observed: torch.Tensor) -> ScaleZpTuple:
129129
"""
130-
Calculate updated scales and zero points from observed value
131-
(weight, activation, or attention state).
130+
Accumulate running statistics from the observed value and update
131+
deferred min/max. Qparams (scale/zero_point) are not computed here;
132+
they are written once at epoch end via update_module_qparams_from_observer.
132133
133134
:param observed: value being observed
134-
:return: calibrated scale and zero point
135+
:return: calibrated scale and zero point (from accumulated stats)
135136
"""
137+
self.update_deferred_stats(observed)
136138
scales, zero_points, _min, _max = self._forward_with_minmax(observed)
137139
return (scales, zero_points)
138140

@@ -195,7 +197,7 @@ def _check_has_global_scale(self, global_scale: Optional[torch.nn.Parameter]):
195197

196198

197199
@torch.no_grad()
198-
def calibrate_module_from_observer(
200+
def update_module_qparams_from_observer(
199201
module: torch.nn.Module,
200202
base_name: str,
201203
) -> bool:

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,9 @@ def __call__(
160160
# propagation pass: modifier hooks are disabled but quantization is
161161
# re-enabled so that compressed module outputs are quantized.
162162
# This ensures downstream subgraphs receive realistic inputs.
163-
model.apply(enable_quantization)
164-
with HooksMixin.disable_hooks():
163+
with contextlib.ExitStack() as prop_stack:
164+
prop_stack.enter_context(HooksMixin.disable_hooks())
165+
model.apply(enable_quantization)
165166
for batch_idx, inputs in _get_batches(
166167
activations,
167168
num_batches,
@@ -173,6 +174,7 @@ def __call__(
173174
if subgraph_index < num_subgraphs - 1:
174175
activations.update(batch_idx, output)
175176
activations.delete(batch_idx, subgraph.consumed_names)
177+
# restore disabled quantization for next calibration pass
176178
model.apply(disable_quantization)
177179

178180
# redundant, finish any remaining compression

0 commit comments

Comments
 (0)