Commit 4b790ec
[MOE] Add a set of functionalities to support quantization of MOE models (#46)
* Update base.py
* add token counter
* implemented token counting
* observer to count throughout calibration
* cleanup tests
* avoid circular dep on import
* Update helpers.py
* fix tests
* post rebase fixes
* Update src/compressed_tensors/quantization/observers/helpers.py

Co-authored-by: [email protected] <[email protected]>
1 parent aecb127 commit 4b790ec
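In short, this commit teaches every activation Observer to count the rows of each 2-D tensor it sees during calibration, and adds a helper that collects those per-layer counts. A standalone sketch of the accounting idea (not the library's actual classes, just the logic the diffs below implement):

import torch

class TokenCountingObserverSketch:
    # hypothetical stand-in for the Observer changes in this commit
    def __init__(self):
        self._num_observed_tokens = None  # None until a 2-D input is seen

    def observe(self, batch_tensor: torch.Tensor):
        if batch_tensor.ndim != 2:
            return  # only (batch_size * seq_len, num_features) inputs count
        if self._num_observed_tokens is None:
            self._num_observed_tokens = 0
        self._num_observed_tokens += batch_tensor.shape[0]

obs = TokenCountingObserverSketch()
obs.observe(torch.randn(8, 4))  # 8 tokens
obs.observe(torch.randn(3, 4))  # 3 more
assert obs._num_observed_tokens == 11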

8 files changed: +190 -10 lines changed

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 5 additions & 0 deletions

@@ -293,6 +293,11 @@ def maybe_calibrate_or_quantize(
     }:
         return value
 
+    if value.numel() == 0:
+        # if the tensor is empty,
+        # skip quantization
+        return value
+
     if args.dynamic:
         # dynamic quantization - get scale and zero point directly from observer
         observer = getattr(module, f"{base_name}_observer")
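This guard matters for MoE calibration: an expert that is routed zero tokens in a batch receives an empty activation, and running the observer on it would be pointless. A quick illustration of the condition being checked:

import torch

# an expert that received no tokens sees an activation with zero elements
empty_activation = torch.empty(0, 4)  # (0 tokens, 4 features)
assert empty_activation.numel() == 0  # the new early return leaves it untouched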

src/compressed_tensors/quantization/observers/base.py

Lines changed: 39 additions & 0 deletions

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 from typing import Any, Iterable, Optional, Tuple, Union
 
 import torch
@@ -24,6 +25,9 @@
 from torch.nn import Module
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 __all__ = ["Observer"]
 
 
@@ -39,6 +43,7 @@ def __init__(self, quantization_args: QuantizationArgs):
         super().__init__()
         self._scale = None
         self._zero_point = None
+        self._num_observed_tokens = None
 
     @torch.no_grad()
     def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
@@ -48,6 +53,7 @@ def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
         from
         :return: tuple of scale and zero point based on last observed value
         """
+        self.record_observed_tokens(observed)
         return self.get_qparams(observed=observed)
 
     def calculate_qparams(
@@ -132,3 +138,36 @@ def get_qparams_along_dim(
         return self.calculate_qparams(
             observed, reduce_dims=reduce_dims, tensor_id=tensor_id
         )
+
+    def record_observed_tokens(self, batch_tensor: Tensor):
+        """
+        Counts the number of tokens observed during the
+        forward passes. The count is aggregated in the
+        _num_observed_tokens attribute of the class.
+
+        Note: The batch_tensor is expected to have two dimensions
+        (batch_size * sequence_length, num_features). This is the
+        general shape expected by the forward pass of the expert
+        layers in a MOE model. If the input tensor does not have
+        two dimensions, the count is left unchanged (and stays
+        None if nothing 2-D has been observed yet).
+        """
+        if not isinstance(batch_tensor, Tensor):
+            raise ValueError(f"Expected value to be a tensor, got {type(batch_tensor)}")
+
+        if batch_tensor.ndim != 2:
+            _LOGGER.debug(
+                "The input tensor is expected to have two dimensions "
+                "(batch_size * sequence_length, num_features). "
+                f"The input tensor has {batch_tensor.ndim} dimensions."
+            )
+            return
+
+        if self._num_observed_tokens is None:
+            # initialize the count
+            self._num_observed_tokens = 0
+
+        # batch_tensor (batch_size * sequence_length, num_features)
+        # observed_tokens (batch_size * sequence_length)
+        observed_tokens, _ = batch_tensor.shape
+        self._num_observed_tokens += observed_tokens
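Note the asymmetry the shape check creates: expert linears in a MoE receive flattened 2-D inputs and are counted, while layers fed 3-D (batch_size, seq_len, hidden) activations are skipped, so their count stays None. That is why callers of the token-count helper filter out None values (see the test further down). A minimal sketch of the check, using a hypothetical helper:

import torch

def counted_tokens(batch_tensor: torch.Tensor):
    # mirrors record_observed_tokens' ndim check
    return batch_tensor.shape[0] if batch_tensor.ndim == 2 else None

assert counted_tokens(torch.randn(8, 4)) == 8        # flattened expert input
assert counted_tokens(torch.randn(2, 4, 8)) is None  # (batch, seq, hidden) input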

src/compressed_tensors/quantization/observers/helpers.py

Lines changed: 21 additions & 3 deletions

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import Counter
 from typing import Tuple
 
 import torch
@@ -23,16 +24,33 @@
 from torch import FloatTensor, IntTensor, Tensor
 
 
-__all__ = ["calculate_qparams", "calculate_range"]
+__all__ = ["calculate_qparams", "get_observer_token_count", "calculate_range"]
+
+
+def get_observer_token_count(module: torch.nn.Module) -> Counter:
+    """
+    Parse the module and return the number of tokens observed by
+    each module's observer.
+
+    :param module: module to parse
+    :return: counter with the number of tokens observed by each observer
+    """
+    token_counts = Counter()
+    for name, submodule in module.named_modules():
+        if name.endswith(".input_observer"):
+            token_counts[
+                name.replace(".input_observer", "")
+            ] = submodule._num_observed_tokens
+    return token_counts
 
 
 def calculate_qparams(
     min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs
 ) -> Tuple[FloatTensor, IntTensor]:
     """
-    :param min_vals: tensor of min value(s) to caluclate scale(s) and zero point(s)
+    :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)
         from
-    :param max_vals: tensor of max value(s) to caluclate scale(s) and zero point(s)
+    :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s)
         from
     :param quantization_args: settings to quantization
     :return: tuple of the calculated scale(s) and zero point(s)
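After calibration, get_observer_token_count can be used to audit how many tokens each quantized layer actually saw. A minimal usage sketch, assuming `model` is a module already prepared with apply_quantization_config and run on calibration data (the test further down shows the full flow):

from compressed_tensors.quantization.observers.helpers import get_observer_token_count

counter = get_observer_token_count(model)  # `model` assumed to be in scope
# drop observers that never saw a 2-D input (their count is None)
counter = {name: n for name, n in counter.items() if n is not None}
for name, n in sorted(counter.items()):
    print(f"{name}: {n} tokens observed")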

tests/test_quantization/lifecycle/test_forward.py

Lines changed: 7 additions & 6 deletions

@@ -57,23 +57,24 @@ def test_maybe_calibrate_or_quantize(create_quantization_scheme, quantization_st
     quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=True)
     layer = Linear(4, 4)
     layer.weight.data *= 100
+
+    dummy_tensor = torch.randn(8, 4)  # (num_tokens, num_features)
     layer.quantization_status = QuantizationStatus(quantization_status)
 
     initialize_module_for_quantization(layer, quantization_scheme)
 
     # only calibration updates the scale and zero-point
     if layer.quantization_status == QuantizationStatus.INITIALIZED:
         out = maybe_calibrate_or_quantize(
-            layer, layer.weight.data, "input", quantization_args
+            layer, dummy_tensor, "input", quantization_args
         )
-        assert torch.allclose(out, layer.weight.data)
+        assert torch.allclose(out, dummy_tensor)
     elif layer.quantization_status == QuantizationStatus.CALIBRATION:
-
         out = maybe_calibrate_or_quantize(
-            layer, layer.weight.data, "input", quantization_args
+            layer, dummy_tensor, "input", quantization_args
         )
-        assert torch.allclose(out, layer.weight.data, atol=0.2)
-
+        assert torch.allclose(out, dummy_tensor, atol=0.2)
+        assert layer.input_observer._num_observed_tokens == dummy_tensor.shape[0]
     elif layer.quantization_status == QuantizationStatus.FROZEN:
         # scale and zero points are empty -- cannot quantize
         with pytest.raises(Exception):
Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from compressed_tensors.quantization import (
+    QuantizationConfig,
+    apply_quantization_config,
+)
+from compressed_tensors.quantization.observers.helpers import get_observer_token_count
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def test_get_observer_token_count():
+    model = AutoModelForCausalLM.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE")
+    tokenizer = AutoTokenizer.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE")
+    model.eval()
+    config = QuantizationConfig(
+        format="fakequant",
+        quantization_status="calibration",
+        config_groups={
+            "group_1": {
+                "input_activations": {
+                    "num_bits": 8,
+                    "type": "int",
+                    "symmetric": False,
+                    "strategy": "tensor",
+                },
+                "targets": ["Linear"],
+            },
+        },
+    )
+    apply_quantization_config(model, config)
+
+    # start calibration
+    calib_list = [
+        "I am a string that",
+        "is used for calibration so",
+        "that your model is",
+        "quantized properly.",
+    ]
+
+    total_num_tokens_observed = 0
+    for calib_sample in calib_list:
+        calib_tensor = tokenizer(calib_sample, return_tensors="pt")
+        _ = model(**calib_tensor)
+        total_num_tokens_observed += len(calib_tensor.input_ids.flatten())
+
+    counter = get_observer_token_count(model)
+
+    # filter out the None values
+    # (observers that never saw a 2-D input and thus counted no tokens)
+    counter = {k: v for k, v in counter.items() if v is not None}
+
+    # iterate over all the layers in the model where a token count
+    # in the proper format has been observed
+    for i in range(model.config.num_hidden_layers):
+        # fetch the tokens observed by the router
+        tokens_observed_by_router = counter.pop(
+            f"model.layers.{i}.block_sparse_moe.gate"
+        )
+        assert tokens_observed_by_router == total_num_tokens_observed
+
+        # fetch the sum of tokens observed by all the experts
+        sum_tokens_observed_by_experts = 0
+        keys_for_this_layer = [
+            k
+            for k in counter.keys()
+            if f"model.layers.{i}.block_sparse_moe.experts" in k
+        ]
+        for key in keys_for_this_layer:
+            sum_tokens_observed_by_experts += counter.pop(key)
+
+        # each Mixtral expert consists of 3 linear layers,
+        # so we need to multiply by 3
+        assert (
+            sum_tokens_observed_by_experts
+            == total_num_tokens_observed * model.config.num_experts_per_tok * 3
+        )
+
+    # no information should be left in the counter
+    assert len(counter) == 0
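The final assertions encode a simple accounting identity: the router sees every token, each token is routed to num_experts_per_tok experts, and each Mixtral expert applies three Linear layers (w1, w2, w3 in the HF implementation), each with its own input observer. For example, assuming 15 calibration tokens and top-2 routing:

# hypothetical numbers, just to make the identity concrete
total_num_tokens_observed = 15  # tokens across all calibration samples
num_experts_per_tok = 2         # Mixtral routes each token to 2 experts
linears_per_expert = 3          # w1, w2, w3 inside each expert MLP

expected = total_num_tokens_observed * num_experts_per_tok * linears_per_expert
assert expected == 90  # what sum_tokens_observed_by_experts should add up to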

tests/test_quantization/test_observers/test_min_max.py

Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ def test_min_max_observer_value_update():
 
     delta = 1e-6
 
-    # udpate the min, max twice total
+    # update the min, max twice total
     tensors = [
        inp,
        inp,

tests/test_utils/__init__.py

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
