
Commit b2abe72

[Observer Restructure]: Remove MemoryLess Observer; use helper function for dynamic quantization (#187)
* remove memoryless observer; use helper function for dynamic quantization
* update init
* clean-up
* update test case
* fix arg
* validation + update name
* update preset schemes; swap condition check
1 parent b876a60 commit b2abe72

File tree: 8 files changed (+85, -74 lines)

src/compressed_tensors/quantization/lifecycle/forward.py
Lines changed: 6 additions & 4 deletions

@@ -18,7 +18,10 @@
 
 import torch
 from compressed_tensors.quantization.cache import QuantizedKVParameterCache
-from compressed_tensors.quantization.observers.helpers import calculate_range
+from compressed_tensors.quantization.observers.helpers import (
+    calculate_range,
+    compute_dynamic_scales_and_zp,
+)
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
@@ -376,9 +379,8 @@ def maybe_calibrate_or_quantize(
     g_idx = getattr(module, "weight_g_idx", None)
 
     if args.dynamic:
-        # dynamic quantization - get scale and zero point directly from observer
-        observer = getattr(module, f"{base_name}_observer")
-        scale, zero_point = observer(value, g_idx=g_idx)
+        # dynamic quantization - no need to invoke observer
+        scale, zero_point = compute_dynamic_scales_and_zp(value=value, args=args)
    else:
         # static quantization - get previous scale and zero point from layer
         scale = getattr(module, f"{base_name}_scale")
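For illustration, here is a minimal sketch (not part of this commit; the tensor shape and argument values are made up) of the new dynamic path: quantization parameters are derived from the live activation tensor and fed straight into fake_quantize, with no observer submodule or registered scale/zero-point involved.

import torch

from compressed_tensors.quantization.lifecycle.forward import fake_quantize
from compressed_tensors.quantization.observers.helpers import compute_dynamic_scales_and_zp
from compressed_tensors.quantization.quant_args import QuantizationArgs, QuantizationStrategy

# per-token dynamic int8 activation quantization, as in the preset schemes below
args = QuantizationArgs(
    num_bits=8,
    symmetric=True,
    strategy=QuantizationStrategy.TOKEN,
    dynamic=True,
    observer=None,
)

x = torch.randn(4, 128, 512)  # hypothetical (batch, seq_len, hidden) activations

# qparams are recomputed from the tensor on every call
scale, zero_point = compute_dynamic_scales_and_zp(value=x, args=args)
x_fq = fake_quantize(x, scale, zero_point, args)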

src/compressed_tensors/quantization/lifecycle/initialize.py
Lines changed: 7 additions & 6 deletions

@@ -153,12 +153,16 @@ def _initialize_scale_zero_point_observer(
     weight_shape: Optional[torch.Size] = None,
     force_zero_point: bool = True,
 ):
+
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
-    module.register_module(f"{base_name}_observer", observer)
+    # no need to register an observer for dynamic quantization
+    if observer:
+        module.register_module(f"{base_name}_observer", observer)
 
+    # no need to register a scale and zero point for a dynamic quantization
     if quantization_args.dynamic:
-        return  # no need to register a scale and zero point for a dynamic observer
+        return
 
     device = next(module.parameters()).device
     if is_module_offloaded(module):
@@ -173,10 +177,7 @@ def _initialize_scale_zero_point_observer(
         expected_shape = (weight_shape[0], 1)
     elif quantization_args.strategy == QuantizationStrategy.GROUP:
         num_groups = weight_shape[1] // quantization_args.group_size
-        expected_shape = (
-            weight_shape[0],
-            max(num_groups, 1)
-        )
+        expected_shape = (weight_shape[0], max(num_groups, 1))
 
     scale_dtype = module.weight.dtype
     if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
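As a hedged illustration of the effect (not part of the commit, and assuming the public lifecycle helpers initialize_module_for_quantization and QuantizationScheme behave as at this revision), initializing a layer whose input activations are dynamically quantized now leaves it with no input observer and no persistent input qparams:

import torch

from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationScheme,
    QuantizationStrategy,
    initialize_module_for_quantization,
)

layer = torch.nn.Linear(64, 64)
scheme = QuantizationScheme(
    targets=["Linear"],
    input_activations=QuantizationArgs(
        num_bits=8,
        strategy=QuantizationStrategy.TOKEN,
        dynamic=True,
        observer=None,
    ),
)
initialize_module_for_quantization(layer, scheme)

# dynamic input activations: no observer submodule, no registered qparams
assert not hasattr(layer, "input_observer")
assert not hasattr(layer, "input_scale")
assert not hasattr(layer, "input_zero_point")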

src/compressed_tensors/quantization/observers/__init__.py
Lines changed: 0 additions & 1 deletion

@@ -17,6 +17,5 @@
 
 from .helpers import *
 from .base import *
-from .memoryless import *
 from .min_max import *
 from .mse import *

src/compressed_tensors/quantization/observers/helpers.py
Lines changed: 40 additions & 2 deletions

@@ -13,18 +13,56 @@
 # limitations under the License.
 
 from collections import Counter
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.quant_args import (
     FP8_DTYPE,
     QuantizationArgs,
+    QuantizationStrategy,
     QuantizationType,
 )
 from torch import FloatTensor, IntTensor, Tensor
 
 
-__all__ = ["calculate_qparams", "get_observer_token_count", "calculate_range"]
+__all__ = [
+    "calculate_qparams",
+    "get_observer_token_count",
+    "calculate_range",
+    "compute_dynamic_scales_and_zp",
+]
+
+
+def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs):
+    """
+    Returns the computed scales and zero points for dynamic activation
+    quantization.
+
+    :param value: tensor to calculate quantization parameters for
+    :param args: quantization args
+    :param reduce_dims: optional tuple of dimensions to reduce along,
+        returned scale and zero point will be shaped (1,) along the
+        reduced dimensions
+    :return: tuple of scale and zero point derived from the observed tensor
+    """
+    if args.strategy == QuantizationStrategy.TOKEN:
+        dim = {1, 2}
+        reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
+    elif args.strategy == QuantizationStrategy.TENSOR:
+        reduce_dims = None
+    else:
+        raise ValueError(
+            f"One of {QuantizationStrategy.TOKEN} or {QuantizationStrategy.TENSOR} ",
+            "must be used for dynamic quantization",
+        )
+
+    if not reduce_dims:
+        min_val, max_val = torch.aminmax(value)
+    else:
+        min_val = torch.amin(value, dim=reduce_dims, keepdims=True)
+        max_val = torch.amax(value, dim=reduce_dims, keepdims=True)
+
+    return calculate_qparams(min_val, max_val, args)
 
 
 def get_observer_token_count(module: torch.nn.Module) -> Counter:
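A short usage sketch of the new helper (not part of the diff; the tensor shape is made up). TENSOR reduces every dimension to a single scale/zero-point pair, while TOKEN keeps dims 1 and 2 and reduces the rest; any other strategy raises a ValueError.

import torch

from compressed_tensors.quantization.observers.helpers import compute_dynamic_scales_and_zp
from compressed_tensors.quantization.quant_args import QuantizationArgs, QuantizationStrategy

activations = torch.randn(2, 16, 64)  # hypothetical (batch, tokens, hidden) input

# per-tensor: one scale / zero point for the whole tensor
tensor_args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR, dynamic=True, observer=None)
scale, zp = compute_dynamic_scales_and_zp(value=activations, args=tensor_args)

# per-token: dims 1 and 2 are kept, all remaining dims are reduced
token_args = QuantizationArgs(strategy=QuantizationStrategy.TOKEN, dynamic=True, observer=None)
scale, zp = compute_dynamic_scales_and_zp(value=activations, args=token_args)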

src/compressed_tensors/quantization/observers/memoryless.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

src/compressed_tensors/quantization/quant_args.py
Lines changed: 28 additions & 4 deletions

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from enum import Enum
 from typing import Any, Dict, Optional, Union
 
@@ -94,7 +95,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     block_structure: Optional[str] = None
     dynamic: bool = False
     actorder: Union[ActivationOrdering, bool, None] = None
-    observer: str = Field(
+    observer: Optional[str] = Field(
         default="minmax",
         description=(
             "The class to use to compute the quantization param - "
@@ -115,10 +116,10 @@ def get_observer(self):
         """
         from compressed_tensors.quantization.observers.base import Observer
 
+        # No observer required for the dynamic case
         if self.dynamic:
-            # override defualt observer for dynamic, you never want minmax which
-            # keeps state across samples for dynamic
-            self.observer = "memoryless"
+            self.observer = None
+            return self.observer
 
         return Observer.load_from_registry(self.observer, quantization_args=self)
 
@@ -171,6 +172,8 @@ def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
         strategy = model.strategy
         group_size = model.group_size
         actorder = model.actorder
+        dynamic = model.dynamic
+        observer = model.observer
 
         # infer strategy
         if strategy is None:
@@ -207,6 +210,27 @@ def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
                 "activation ordering"
             )
 
+        if dynamic:
+            if strategy not in (
+                QuantizationStrategy.TOKEN,
+                QuantizationStrategy.TENSOR,
+            ):
+                raise ValueError(
+                    f"One of {QuantizationStrategy.TOKEN} or "
+                    f"{QuantizationStrategy.TENSOR} must be used for dynamic ",
+                    "quantization",
+                )
+            if observer is not None:
+                warnings.warn(
+                    "No observer is used for dynamic quantization, setting to None"
+                )
+                model.observer = None
+
+        # if we have not set an observer and we
+        # are running static quantization, use minmax
+        if not observer and not dynamic:
+            model.observer = "minmax"
+
         # write back modified values
         model.strategy = strategy
         return model
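A hedged sketch of the new validation behavior (not part of the diff): with dynamic=True an explicitly configured observer is cleared with a warning, dynamic quantization is restricted to the TOKEN and TENSOR strategies, and static args with no observer fall back to "minmax".

from compressed_tensors.quantization.quant_args import QuantizationArgs, QuantizationStrategy

# dynamic + explicit observer: a warning is emitted and the observer is cleared
args = QuantizationArgs(strategy=QuantizationStrategy.TOKEN, dynamic=True, observer="minmax")
assert args.observer is None

# static args with observer=None fall back to the "minmax" default
args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR, observer=None)
assert args.observer == "minmax"

# dynamic quantization with any other strategy (e.g. GROUP) fails validation,
# since the validator raises a ValueError for that combination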

src/compressed_tensors/quantization/quant_scheme.py
Lines changed: 3 additions & 0 deletions

@@ -122,6 +122,7 @@ def is_preset_scheme(name: str) -> bool:
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
 
@@ -164,6 +165,7 @@ def is_preset_scheme(name: str) -> bool:
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
 
@@ -200,6 +202,7 @@ def is_preset_scheme(name: str) -> bool:
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
 
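For example (a hedged check, not part of the commit; it assumes the W8A8 preset is one of the three dynamic-activation presets touched above and that preset_name_to_scheme keeps its current signature), the preset lookup now yields input-activation args with no observer attached:

from compressed_tensors.quantization.quant_scheme import preset_name_to_scheme

scheme = preset_name_to_scheme("W8A8", targets=["Linear"])
assert scheme.input_activations.dynamic is True
assert scheme.input_activations.observer is None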

tests/test_quantization/lifecycle/test_dynamic_lifecycle.py
Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ def _test_layer_dynamic_quantization_status(
     # check inputs always have an observer if quantized but never scale/zp
     assert not hasattr(module, "input_scale")
     assert not hasattr(module, "input_zero_point")
-    assert hasattr(module, "input_observer") == inputs
+    assert not hasattr(module, "input_observer")
 
     # check weights always have scale/zp and observer only if not frozen
     assert hasattr(module, "weight_scale") == weights
