
Commit fd71824

Revert "expand observers to calculate gparams, add example for activations" (#1486)

This reverts commit 830c904.

SUMMARY: "please provide a brief summary"
TEST PLAN: "please outline how the changes were tested"

1 parent 830c904 · commit fd71824

8 files changed: +17 −201 lines changed

examples/quantization_w4a4_fp4/llama3_example.py

Lines changed: 0 additions & 74 deletions
This file was deleted.

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 5 additions & 40 deletions
@@ -5,7 +5,6 @@
     KVCacheScaleType,
     QuantizationScheme,
     QuantizationStatus,
-    QuantizationStrategy,
 )
 from compressed_tensors.quantization.lifecycle.forward import forward_quantize
 from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
@@ -85,48 +84,14 @@ def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor]
                 "Must provide a value to observe if not using weight observer"
             )
 
-        quantization_scheme = getattr(module, "quantization_scheme", None)
-        should_calculate_gparam = False
-        should_calculate_qparams = True
-
-        # TODO: will update to be the case for both weight and input in a follow-up
-        # weight global calculate is currently done in ct right now; s
-        # should be moved here to unify global scale calculations
-        if (
-            quantization_scheme.strategy == QuantizationStrategy.TENSOR_GROUP
-            and base_name == "input"
-        ):
-            should_calculate_gparam = True
-            should_calculate_qparams = False
-
         observer = getattr(module, f"{base_name}_observer")
-        observer_outputs = observer(
-            value,
-            g_idx=g_idx,
-            global_scale=global_scale,
-            should_calculate_gparam=should_calculate_gparam,
-            should_calculate_qparams=should_calculate_qparams,
+        updated_scale, updated_zero_point = observer(
+            value, g_idx=g_idx, global_scale=global_scale
         )
 
-        if should_calculate_qparams:
-            if should_calculate_gparam:
-                updated_scale, updated_zero_point, updated_global_scale = (
-                    observer_outputs
-                )
-            else:
-                updated_scale, updated_zero_point = observer_outputs
-        else:
-            updated_global_scale = observer_outputs
-
-        if should_calculate_gparam:
-            update_parameter_data(
-                module, updated_global_scale, f"{base_name}_global_scale"
-            )
-
-        if should_calculate_qparams:
-            # update scale and zero point
-            update_parameter_data(module, updated_scale, f"{base_name}_scale")
-            update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+        # update scale and zero point
+        update_parameter_data(module, updated_scale, f"{base_name}_scale")
+        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
 
 
 def update_weight_zp_scale(module: Module):
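After the revert, call_observer expects exactly one return shape from the observer: a (scale, zero_point) pair that is written back onto the module as f"{base_name}_scale" and f"{base_name}_zero_point". A minimal standalone sketch of that flow, using plain torch and a toy min-max observer in place of the real Observer and update_parameter_data APIs:

    import torch

    def toy_observer(value: torch.Tensor):
        # Stand-in for an llmcompressor observer: symmetric int8-style scale over
        # the whole tensor, plus an all-zero zero point.
        scale = value.abs().amax() / 127.0
        zero_point = torch.zeros_like(scale, dtype=torch.int64)
        return scale, zero_point

    # Post-revert calibration always unpacks exactly two values and stores them.
    value = torch.randn(4, 16)
    updated_scale, updated_zero_point = toy_observer(value)
    params = {"input_scale": updated_scale, "input_zero_point": updated_zero_point}
    print(params)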

src/llmcompressor/modifiers/quantization/quantization/mixin.py

Lines changed: 2 additions & 9 deletions
@@ -2,7 +2,6 @@
 
 import torch
 from compressed_tensors.quantization import (
-    DynamicType,
     QuantizationArgs,
     QuantizationConfig,
     QuantizationScheme,
@@ -213,10 +212,7 @@ def _initialize_observers(self, module: torch.nn.Module):
             return
 
         scheme: QuantizationScheme = module.quantization_scheme
-        input = scheme.input_activations and scheme.input_activations.dynamic in (
-            False,
-            DynamicType.LOCAL,
-        )
+        input = scheme.input_activations and not scheme.input_activations.dynamic
         weight = scheme.weights is not None
         output = scheme.output_activations and not scheme.output_activations.dynamic
         is_attention = is_attention_module(module)
@@ -245,10 +241,7 @@ def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]:
                 continue
 
             scheme: QuantizationScheme = module.quantization_scheme
-            input = scheme.input_activations and scheme.input_activations.dynamic in (
-                False,
-                DynamicType.LOCAL,
-            )
+            input = scheme.input_activations and not scheme.input_activations.dynamic
             output = scheme.output_activations and not scheme.output_activations.dynamic
             is_attention = is_attention_module(module)
 
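The behavioral difference is in which modules get an input observer: before the revert, inputs whose dynamic setting was DynamicType.LOCAL were still observed; afterwards any truthy dynamic value skips observer initialization. A small sketch of the two predicates, using toy stand-ins rather than the real compressed_tensors types:

    from dataclasses import dataclass
    from enum import Enum
    from typing import Optional, Union

    class DynamicType(Enum):  # toy stand-in for compressed_tensors' DynamicType
        LOCAL = "local"

    @dataclass
    class ToyActivationArgs:
        dynamic: Union[bool, DynamicType] = False

    @dataclass
    class ToyScheme:
        input_activations: Optional[ToyActivationArgs] = None

    scheme = ToyScheme(input_activations=ToyActivationArgs(dynamic=DynamicType.LOCAL))

    # Pre-revert predicate: locally-dynamic inputs still get an observer.
    pre_revert = scheme.input_activations and scheme.input_activations.dynamic in (
        False,
        DynamicType.LOCAL,
    )
    # Post-revert predicate: any truthy `dynamic` (including LOCAL) skips the observer.
    post_revert = scheme.input_activations and not scheme.input_activations.dynamic

    print(bool(pre_revert), bool(post_revert))  # True False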
src/llmcompressor/observers/base.py

Lines changed: 1 addition & 12 deletions
@@ -73,14 +73,11 @@ def post_calculate_qparams(self) -> None:
         Run any logic specific to its observers after running calculate_qparams
         """
 
-    # TODO: use a different name?
     def get_qparams(
         self,
         observed: Optional[Tensor] = None,
         g_idx: Optional[Tensor] = None,
         global_scale: Optional[Tensor] = None,
-        should_calculate_gparam: bool = False,
-        should_calculate_qparams: bool = True,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Convenience function to wrap overwritten calculate_qparams
@@ -104,14 +101,6 @@ def get_qparams(
                 QuantizationStrategy.TENSOR_GROUP,
                 QuantizationStrategy.GROUP,
             ):
-                # Global params are for the entire tensor
-                if should_calculate_gparam:
-                    return self.calculate_qparams(
-                        observed,
-                        should_calculate_gparam=True,
-                        should_calculate_qparams=False,
-                    )
-
                 rows = observed.shape[0]
                 columns = observed.shape[1]
                 num_groups = int(ceil(columns / group_size))
@@ -148,7 +137,7 @@ def get_qparams(
                         observed[:, start:end],
                         0,
                         tensor_id=group_index,
-                        global_scale=global_scale,
+                        global_scale=global_scale
                     )
 
                     self._scale[:, group_index] = scale.squeeze(1)
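What remains after the revert is the plain group-wise path: split the columns into ceil(columns / group_size) groups and compute one scale per row and group. A rough standalone illustration of that loop, using a simple abs-max scale as a stand-in for calculate_qparams:

    import math
    import torch

    def toy_group_scales(observed: torch.Tensor, group_size: int) -> torch.Tensor:
        # One scale per (row, group), mirroring how get_qparams fills self._scale.
        rows, columns = observed.shape
        num_groups = int(math.ceil(columns / group_size))
        scales = torch.empty(rows, num_groups)
        for group_index in range(num_groups):
            start = group_index * group_size
            end = min(start + group_size, columns)
            # Simple symmetric int8-style scale for the group slice.
            scales[:, group_index] = observed[:, start:end].abs().amax(dim=1) / 127.0
        return scales

    print(toy_group_scales(torch.randn(2, 10), group_size=4).shape)  # torch.Size([2, 3])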

src/llmcompressor/observers/helpers.py

Lines changed: 1 addition & 30 deletions
@@ -1,14 +1,8 @@
 from collections import Counter
-from typing import Optional
 
 import torch
-from compressed_tensors.quantization.quant_args import (
-    FP4_E2M1_DATA,
-    FP8_E4M3_DATA,
-    FloatArgs,
-)
 
-__all__ = ["get_observer_token_count", "calculate_gparam"]
+__all__ = ["get_observer_token_count"]
 
 
 def get_observer_token_count(module: torch.nn.Module) -> Counter:
@@ -26,26 +20,3 @@ def get_observer_token_count(module: torch.nn.Module) -> Counter:
             module._num_observed_tokens
         )
     return token_counts
-
-
-def calculate_gparam(
-    updated_min_val: torch.Tensor,
-    updated_max_val: torch.Tensor,
-    scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
-    quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
-    dtype: Optional[torch.dtype] = torch.float32,
-):
-    """
-    Generate a global scale for an entire tensor (input_tensor).
-    Goal of the scale is to ensure that the quantization (local) scale
-    falls into the appropriate dtype range.
-
-    E.g. for NVFP4, group (local) scales are in dtype FP8. The global_scale
-    attempts to use the entire FP8 dtype range while mapping a per-group max
-    to the FP4 max.
-    """
-    min_vals = torch.min(updated_min_val, torch.zeros_like(updated_min_val))
-    max_vals = torch.max(updated_max_val, torch.zeros_like(updated_max_val))
-    max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-    global_scale = scale_data.max * quant_data.max / max_val_pos
-    return global_scale.to(dtype)
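For reference, the arithmetic in the removed calculate_gparam is small: take the largest absolute observed value and pick a scale so that per-group (FP4) maxima map onto the FP8 scale range. A self-contained sketch with the FP8-E4M3 and FP4-E2M1 maxima written as literals (448.0 and 6.0) rather than pulled from compressed_tensors:

    import torch

    def toy_calculate_gparam(
        min_val: torch.Tensor,
        max_val: torch.Tensor,
        scale_max: float = 448.0,  # FP8-E4M3 maximum (what FP8_E4M3_DATA.max provides)
        quant_max: float = 6.0,    # FP4-E2M1 maximum (what FP4_E2M1_DATA.max provides)
        dtype: torch.dtype = torch.float32,
    ) -> torch.Tensor:
        min_vals = torch.min(min_val, torch.zeros_like(min_val))
        max_vals = torch.max(max_val, torch.zeros_like(max_val))
        max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
        # Chosen so the local (per-group) scales land inside the FP8 dtype range.
        return (scale_max * quant_max / max_val_pos).to(dtype)

    print(toy_calculate_gparam(torch.tensor(-3.0), torch.tensor(5.0)))  # 448 * 6 / 5 = 537.6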

src/llmcompressor/observers/min_max.py

Lines changed: 1 addition & 15 deletions
@@ -6,7 +6,6 @@
 from compressed_tensors.utils import deprecated
 
 from llmcompressor.observers.base import Observer
-from llmcompressor.observers.helpers import calculate_gparam
 
 __all__ = ["MinMaxObserver", "MovingAverageMinMaxObserver"]
 
@@ -36,8 +35,6 @@ def calculate_qparams(
         reduce_dims: Optional[Tuple[int]] = None,
         tensor_id: Optional[Any] = None,
         global_scale: Optional[torch.Tensor] = None,
-        should_calculate_gparam: bool = False,
-        should_calculate_qparams: bool = True,
     ) -> Tuple[torch.FloatTensor, torch.IntTensor]:
         """
         Updates the observed min and max using a moving average smoothed by the
@@ -86,24 +83,13 @@ def calculate_qparams(
         self.min_val[tensor_id] = updated_min_val
         self.max_val[tensor_id] = updated_max_val
 
-        if should_calculate_gparam:
-            global_scale = calculate_gparam(
-                updated_min_val=updated_max_val, updated_max_val=updated_max_val
-            )
-            if not should_calculate_qparams:
-                return global_scale
-
-        scale, zero_point = calculate_qparams(
+        return calculate_qparams(
             min_vals=updated_min_val,
             max_vals=updated_max_val,
             quantization_args=self.quantization_args,
             global_scale=global_scale,
         )
 
-        if should_calculate_gparam:
-            return scale, zero_point, global_scale
-        return scale, zero_point
-
     def get_qparams_along_dim(
         self,
         observed: torch.Tensor,
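Independent of the revert, the observer's core update is a moving average of the running min and max, smoothed by an averaging constant. A toy version of that update, not the actual MovingAverageMinMaxObserver code:

    import torch

    def toy_update(running_min, running_max, observed: torch.Tensor, averaging_constant=0.01):
        # Smooth the running min/max toward the current batch's min/max.
        batch_min, batch_max = observed.amin(), observed.amax()
        if running_min is None:
            return batch_min, batch_max
        new_min = running_min + averaging_constant * (batch_min - running_min)
        new_max = running_max + averaging_constant * (batch_max - running_max)
        return new_min, new_max

    running_min, running_max = None, None
    for _ in range(3):
        running_min, running_max = toy_update(running_min, running_max, torch.randn(8, 8))
    print(running_min, running_max)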

src/llmcompressor/observers/mse.py

Lines changed: 1 addition & 15 deletions
@@ -6,7 +6,6 @@
 from torch import FloatTensor, IntTensor, Tensor
 
 from llmcompressor.observers.base import Observer
-from llmcompressor.observers.helpers import calculate_gparam
 
 __all__ = ["MovingAverageMSEObserver"]
 
@@ -116,8 +115,6 @@ def calculate_qparams(
         reduce_dims: Optional[Tuple[int]] = None,
         tensor_id: Optional[Any] = None,
         global_scale: Optional[torch.Tensor] = None,
-        should_calculate_gparam: bool = False,
-        should_calculate_qparams: bool = True,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Updates the mse-clipped min and max values of the observed tensor using
@@ -152,24 +149,13 @@ def calculate_qparams(
         self.min_val[tensor_id] = updated_min_val
         self.max_val[tensor_id] = updated_max_val
 
-        if should_calculate_gparam:
-            global_scale = calculate_gparam(
-                updated_min_val=updated_max_val, updated_max_val=updated_max_val
-            )
-            if not should_calculate_qparams:
-                return global_scale
-
-        scale, zero_point = calculate_qparams(
+        return calculate_qparams(
             min_vals=updated_min_val,
             max_vals=updated_max_val,
             quantization_args=self.quantization_args,
             global_scale=global_scale,
         )
 
-        if should_calculate_gparam:
-            return scale, zero_point, global_scale
-        return scale, zero_point
-
     def get_qparams_along_dim(
         self,
         observed,
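The "mse-clipped min and max" in the docstring refers to searching over progressively tighter clipping ranges and keeping the one with the lowest quantization error. A rough, hypothetical illustration of that idea, not the observer's actual grid-search parameters:

    import torch

    def toy_mse_clip(x: torch.Tensor, steps: int = 20, bits: int = 8):
        # Try progressively shrunken clipping ranges; keep the one with lowest MSE.
        best_err, best_min, best_max = float("inf"), x.amin(), x.amax()
        levels = 2 ** bits - 1
        for i in range(1, steps + 1):
            shrink = i / steps
            lo, hi = x.amin() * shrink, x.amax() * shrink
            scale = (hi - lo) / levels
            quantized = ((x.clamp(lo, hi) - lo) / scale).round() * scale + lo
            err = torch.mean((quantized - x) ** 2).item()
            if err < best_err:
                best_err, best_min, best_max = err, lo, hi
        return best_min, best_max

    print(toy_mse_clip(torch.randn(1024)))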

src/llmcompressor/transformers/compression/quantization_format.py

Lines changed: 6 additions & 6 deletions
@@ -61,13 +61,13 @@ def infer_quantization_format(
         )
         is_weight_only = len(input_args) == 0 and len(weight_args) > 0
 
-        if (
-            weight_args[0].num_bits == 4
-            and weight_args[0].type == QuantizationType.FLOAT.value
-        ):
-            return CompressionFormat.nvfp4_pack_quantized
-
         if is_weight_only:  # w4a16 and w8a16
+            if (
+                weight_args[0].num_bits == 4
+                and weight_args[0].type == QuantizationType.FLOAT.value
+            ):
+                return CompressionFormat.nvfp4_pack_quantized
+
             is_valid_pack = all(
                 weight_arg.num_bits in [4, 8]
                 and weight_arg.type == QuantizationType.INT.value
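The effect of this change is ordering: the NVFP4 packed format is only considered once the scheme has been established as weight-only, rather than before that check. A toy walk-through of the post-revert decision (format names and branches approximated as strings, not the full infer_quantization_format logic):

    def toy_infer_format(num_bits: int, qtype: str, weight_only: bool) -> str:
        if weight_only:  # w4a16 and w8a16
            if num_bits == 4 and qtype == "float":
                return "nvfp4-pack-quantized"
            if num_bits in (4, 8) and qtype == "int":
                return "pack-quantized"
            return "naive-quantized"
        return "int-quantized"

    print(toy_infer_format(4, "float", weight_only=True))   # nvfp4-pack-quantized
    print(toy_infer_format(4, "float", weight_only=False))  # int-quantized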
