Skip to content

Commit 59b88bf

Browse files
committed
fix: use update_deferred_stats for all observer types
MemorylessMinMaxObserver has no past_min_vals, so get_accumulated_min_max() always returned None, causing the scale to remain 0. Fix: add update_deferred_stats() to the Observer base class, which maintains _deferred_min/_deferred_max independently of the subclass implementation. calibrate_activations(stats_only=True) now calls this instead of observer(value). Local validation on opt-125m (CPU, 32 calibration samples): 72/72 modules have input_scale; perplexity 28.86 (FP32) -> 30.78 (INT8), a 6.7% degradation; no observer stats leaked after calibration.
1 parent 49e7208 commit 59b88bf

File tree

2 files changed

+39
-14
lines changed

2 files changed

+39
-14
lines changed

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -191,11 +191,18 @@ def calibrate_activations(
191191
if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
192192
calculate_gparam = True
193193

194-
# In deferred (stats_only) mode, only accumulate running min/max in the
195-
# observer — skip writing scale/zero_point until epoch end.
194+
# In deferred (stats_only) mode: call the observer to accumulate running
195+
# min/max stats but do NOT write scale/zero_point yet.
196+
# Qparams are written once at epoch end via flush_activation_qparams.
196197
if stats_only:
197-
calculate_qparams = False
198-
calculate_gparam = False
198+
# Deferred mode: accumulate global min/max into the observer's
199+
# _deferred_min / _deferred_max. Works for ALL observer types,
200+
# including MemorylessMinMaxObserver which has no past_min_vals.
201+
# Qparams are written once at epoch end via flush_activation_qparams.
202+
observer = getattr(module, f"{base_name}_observer", None)
203+
if observer is not None:
204+
observer.update_deferred_stats(value)
205+
return
199206

200207
call_observer(
201208
module=module,

src/llmcompressor/observers/base.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,31 +74,49 @@ def get_global_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
7474
"""
7575
raise NotImplementedError()
7676

77+
def update_deferred_stats(self, observed: torch.Tensor):
78+
"""
79+
Accumulate global min/max from an observed tensor into ``_deferred_min``
80+
and ``_deferred_max`` on this observer.
81+
82+
Called by ``calibrate_activations`` in ``stats_only`` mode for ALL observer
83+
types including ``MemorylessMinMaxObserver`` which has no ``past_min_vals``.
84+
85+
:param observed: activation tensor for this batch
86+
"""
87+
batch_min = observed.float().min()
88+
batch_max = observed.float().max()
89+
90+
if not hasattr(self, "_deferred_min") or self._deferred_min is None:
91+
self._deferred_min = batch_min
92+
self._deferred_max = batch_max
93+
else:
94+
self._deferred_min = torch.min(self._deferred_min, batch_min)
95+
self._deferred_max = torch.max(self._deferred_max, batch_max)
96+
7797
def get_accumulated_min_max(self) -> Optional[MinMaxTuple]:
7898
"""
79-
Return the accumulated running min/max statistics stored by this observer,
80-
without performing any new observation. Returns None if no statistics have
81-
been accumulated yet (i.e. no batches have been seen).
99+
Return accumulated min/max populated by ``update_deferred_stats``.
100+
Returns None if no batches have been seen yet.
82101
83-
Subclasses which accumulate state (StaticMinMax, MovingAverage) naturally
84-
expose this through their ``past_min_vals`` / ``past_max_vals`` attributes.
85-
Memoryless observers have no running state, so this always returns None.
102+
Works for all observer types including ``MemorylessMinMaxObserver``.
86103
87104
:return: (min_vals, max_vals) tensors or None
88105
"""
89-
min_vals = getattr(self, "past_min_vals", None)
90-
max_vals = getattr(self, "past_max_vals", None)
106+
min_vals = getattr(self, "_deferred_min", None)
107+
max_vals = getattr(self, "_deferred_max", None)
91108
if min_vals is None or max_vals is None:
92109
return None
93110
return min_vals, max_vals
94111

95112
def clear_accumulated_stats(self):
96113
"""
97114
Delete accumulated running statistics to free memory after qparams have been
98-
computed and written to the parent module. Only clears attributes that exist
99-
on the observer (memoryless observers are unaffected).
115+
computed and written to the parent module.
100116
"""
101117
for attr in (
118+
"_deferred_min",
119+
"_deferred_max",
102120
"past_min_vals",
103121
"past_max_vals",
104122
"past_global_min_vals",

0 commit comments

Comments (0)