
Commit 0b79d09

fix typos
Signed-off-by: Kyle Sayers <[email protected]>
1 parent c63986a commit 0b79d09

File tree

4 files changed, +76 -103 lines changed


src/llmcompressor/observers/helpers.py

Lines changed: 6 additions & 11 deletions
```diff
@@ -65,24 +65,19 @@ def _flatten_weight(
 
     if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
         if g_idx is not None:
-            value = value.index_select(dim=1, index=g_idx)
+            value = value.index_select(dim=1, index=torch.argsort(g_idx))
 
         # (1, num_rows, num_groups, group_size)
         return value.unflatten(-1, (-1, args.group_size)).unsqueeze(0)
 
     if args.strategy == QuantizationStrategy.BLOCK:
         # (1, num_block_rows, num_block_cols, block_width * block_height)
         block_height, block_width = args.block_structure
-        num_rows, num_cols = value.shape
-        num_block_rows = strategy_cdiv(num_rows, block_height, args.strategy)
-        num_block_cols = strategy_cdiv(num_cols, block_width, args.strategy)
+        rows, cols = value.shape
+        block_rows = strategy_cdiv(rows, block_height, args.strategy, strict=True)
+        block_cols = strategy_cdiv(cols, block_width, args.strategy, strict=True)
         return (
-            value.reshape(
-                num_block_rows,
-                block_height,
-                num_block_cols,
-                block_width,
-            )
+            value.reshape(block_rows, block_height, block_cols, block_width)
             .transpose(1, 2)
             .flatten(-2, -1)
             .unsqueeze(0)
@@ -99,7 +94,7 @@ def _flatten_activation(value: torch.Tensor, args: QuantizationArgs):
     if args.strategy == QuantizationStrategy.TOKEN:
         # (batch_size, seq_len, hidden_dim)
         # warning: token quantization uses `compute_dynamic_scales_and_zp`
-        return value.flatten(2, -1)
+        return value
 
     if args.strategy == QuantizationStrategy.CHANNEL:
         raise ValueError("Channel quantization cannot be applied to activations")
```
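
For context, a minimal standalone sketch (not part of the commit) of the two behaviors touched above: gathering columns with `torch.argsort(g_idx)` so that each group becomes contiguous, and the single-call block reshape. Names and shapes here are illustrative only, and `strategy_cdiv` is replaced by plain integer division under the assumption that the shape divides evenly.

```python
import torch

# g_idx regrouping: g_idx[j] is assumed to be the group index of column j.
# Selecting columns in argsort(g_idx) order makes each group contiguous.
group_size = 2
g_idx = torch.tensor([1, 0, 1, 0])
value = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
regrouped = value.index_select(dim=1, index=torch.argsort(g_idx))
assert torch.equal(g_idx[torch.argsort(g_idx)], torch.tensor([0, 0, 1, 1]))

# Block flattening: reshape/transpose/flatten so each trailing slice is one block.
rows, cols = 4, 6
block_height, block_width = 2, 3
weight = torch.arange(rows * cols).reshape(rows, cols).float()
block_rows, block_cols = rows // block_height, cols // block_width

flattened = (
    weight.reshape(block_rows, block_height, block_cols, block_width)
    .transpose(1, 2)
    .flatten(-2, -1)
    .unsqueeze(0)
)
assert flattened.shape == (1, block_rows, block_cols, block_height * block_width)
# The (0, 0) slice is the top-left 2x3 block of `weight`, flattened row-major.
assert torch.equal(flattened[0, 0, 0], weight[:block_height, :block_width].reshape(-1))
```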

src/llmcompressor/observers/min_max.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -44,7 +44,13 @@ def get_min_max(self, observed: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         if self.min_vals is not None and self.averaging_constant != 1.0:
             # FUTURE: consider scaling by num observations (first dim)
             # rather than reducing by first dim
-            min_vals = torch.lerp(self.min_vals, min_vals, self.averaging_constant)
-            max_vals = torch.lerp(self.max_vals, max_vals, self.averaging_constant)
+            min_vals = self._lerp(min_vals, self.min_vals, self.averaging_constant)
+            max_vals = self._lerp(max_vals, self.max_vals, self.averaging_constant)
 
         return min_vals, max_vals
+
+    def _lerp(
+        self, input: torch.Tensor, end: torch.Tensor, weight: float
+    ) -> torch.Tensor:
+        """torch lerp kernel is not implemented for all data types"""
+        return (input * weight) + (end * (1.0 - weight))
```
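
A quick sketch (not from the commit) checking that the manual interpolation used by `_lerp` matches `torch.lerp` where the latter is supported. Note the swapped argument order: `_lerp(new, old, c)` corresponds to `torch.lerp(old, new, c)`, i.e. `c * new + (1 - c) * old`.

```python
import torch

def manual_lerp(input: torch.Tensor, end: torch.Tensor, weight: float) -> torch.Tensor:
    # same arithmetic as the new `_lerp` helper: weight * input + (1 - weight) * end
    return (input * weight) + (end * (1.0 - weight))

old = torch.tensor([1.0, -2.0, 3.0])   # running min/max values
new = torch.tensor([2.0, 0.0, -1.0])   # values from the latest observation
c = 0.01                               # averaging constant

assert torch.allclose(manual_lerp(new, old, c), torch.lerp(old, new, c))
```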

tests/llmcompressor/observers/test_helpers.py

Lines changed: 46 additions & 83 deletions
```diff
@@ -12,98 +12,61 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
 import torch
 from compressed_tensors.quantization import (
-    QuantizationConfig,
-    QuantizationStatus,
-    apply_quantization_config,
+    QuantizationArgs,
+    QuantizationScheme,
+    initialize_module_for_quantization,
 )
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from llmcompressor.modifiers.quantization.calibration import (
-    calibrate_input_hook,
-    initialize_observer,
-)
-from llmcompressor.observers.helpers import get_observer_token_count
-
-
-def _prep_for_input_quant_calibration(module: torch.nn.Module):
-    quantization_scheme = getattr(module, "quantization_scheme", None)
-    if not quantization_scheme:
-        return
-
-    module.register_forward_pre_hook(calibrate_input_hook)
-    module.quantization_status = QuantizationStatus.CALIBRATION
 
+from llmcompressor.observers.helpers import flatten_for_calibration
 
-def test_get_observer_token_count():
-    model = AutoModelForCausalLM.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE")
-    tokenizer = AutoTokenizer.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE")
-    model.eval()
-    config = QuantizationConfig(
-        format="fakequant",
-        quantization_status="calibration",
-        config_groups={
-            "group_1": {
-                "input_activations": {
-                    "num_bits": 8,
-                    "type": "int",
-                    "symmetric": False,
-                    "strategy": "tensor",
-                },
-                "targets": ["Linear"],
-            },
-        },
-    )
-    apply_quantization_config(model, config)
-    model.apply(lambda module: initialize_observer(module, base_name="input"))
-    model.apply(_prep_for_input_quant_calibration)
-
-    # start calibration
-    calib_list = [
-        "I am a string that",
-        "is used for calibration so",
-        "that your model is",
-        "quantized properly.",
-    ]
 
-    total_num_tokens_observed = 0
-    for calib_sample in calib_list:
-        calib_tensor = tokenizer(calib_sample, return_tensors="pt")
-        _ = model(**calib_tensor)
-        total_num_tokens_observed += len(calib_tensor.input_ids.flatten())
+def make_dummy_g_idx(columns: int, group_size: int) -> torch.Tensor:
+    perm = torch.randperm(columns)
+    return torch.tensor([index // group_size for index in range(columns)])[perm]
 
-    counter = get_observer_token_count(model)
 
-    # filter out the None values
-    # (tokens, in the appropriate format, that were not observed by the model)
-    counter = {k: v for k, v in counter.items() if v is not None}
+@pytest.mark.parametrize(
+    "args",
+    [
+        QuantizationArgs(strategy="tensor"),
+        QuantizationArgs(strategy="tensor_group", group_size=4),
+    ],
+)
+def test_flatten_for_calibration_input(args):
+    module = torch.nn.Linear(8, 10)
+    scheme = QuantizationScheme(targets=[], input_activations=args)
+    initialize_module_for_quantization(module, scheme)
 
-    # iterate over all the layers in the model where the token count in the proper
-    # format is has been observed
-    for i in range(model.config.num_hidden_layers):
-        # fetch the tokens observed by the router
-        tokens_observed_by_router = counter.pop(
-            f"model.layers.{i}.block_sparse_moe.gate"
-        )
-        assert tokens_observed_by_router == total_num_tokens_observed
+    input = torch.empty((3, 5, 8))
+    input_flattened = flatten_for_calibration(input, "input", scheme.input_activations)
+    assert input_flattened.shape[1:-1] == module.input_scale.shape
+    assert input_flattened.shape[1:-1] == module.input_zero_point.shape
 
-    # fetch the sum of tokens observed by all the experts
-    sum_tokens_observed_by_experts = 0
-    keys_for_this_layer = [
-        k
-        for k in counter.keys()
-        if f"model.layers.{i}.block_sparse_moe.experts" in k
-    ]
-    for key in keys_for_this_layer:
-        sum_tokens_observed_by_experts += counter.pop(key)
 
-    # each Mixtral expert is comprised of 3 linear layers,
-    # so we need to multiply by 3
-    assert (
-        sum_tokens_observed_by_experts
-        == total_num_tokens_observed * model.config.num_experts_per_tok * 3
-    )
+@pytest.mark.parametrize(
+    "args,g_idx",
+    [
+        (QuantizationArgs(strategy="tensor"), None),
+        (QuantizationArgs(strategy="channel"), None),
+        (QuantizationArgs(strategy="group", group_size=4), None),
+        (QuantizationArgs(strategy="group", group_size=4), make_dummy_g_idx(8, 4)),
+        (QuantizationArgs(strategy="tensor_group", group_size=4), None),
+        (QuantizationArgs(strategy="block", block_structure=[5, 4]), None),
+    ],
+)
+def test_flatten_for_calibration_weights(args, g_idx):
+    module = torch.nn.Linear(8, 10)
+    scheme = QuantizationScheme(targets=[], weights=args)
+    initialize_module_for_quantization(module, scheme)
 
-    # there are no more information in the counter
-    assert len(counter) == 0
+    weight_flattened = flatten_for_calibration(
+        module.weight,
+        "weight",
+        scheme.weights,
+        g_idx=g_idx,
+    )
+    assert weight_flattened.shape[1:-1] == module.weight_scale.shape
+    assert weight_flattened.shape[1:-1] == module.weight_zero_point.shape
```
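
The new tests assert that the flattened tensor's middle dimensions line up with the scale and zero-point shapes, so one min/max reduction lands on each scale entry. A rough sketch of that invariant for the GROUP strategy, assuming compressed-tensors initializes group scales with shape `(out_features, num_groups)`; the reshape mirrors the GROUP branch of `_flatten_weight` shown above.

```python
import torch

out_features, in_features, group_size = 10, 8, 4
num_groups = in_features // group_size

weight = torch.randn(out_features, in_features)
# (1, out_features, num_groups, group_size), mirroring _flatten_weight for GROUP
flattened = weight.unflatten(-1, (-1, group_size)).unsqueeze(0)

expected_scale_shape = (out_features, num_groups)  # assumed weight_scale shape
assert flattened.shape[1:-1] == expected_scale_shape

# Reducing over the observation and group-size dims gives one value per scale entry.
group_min = flattened.amin(dim=(0, -1))
assert group_min.shape == expected_scale_shape
```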

tests/llmcompressor/observers/test_min_max.py

Lines changed: 16 additions & 7 deletions
```diff
@@ -82,15 +82,17 @@ def test_min_max_observer_value_update():
 
     tensor = inp
     num_bits = 8
-    weights = QuantizationArgs(num_bits=num_bits, symmetric=True, observer="minmax")
+    weights = QuantizationArgs(
+        num_bits=num_bits, strategy="tensor", symmetric=True, observer="minmax"
+    )
     observer = weights.observer
     observer = Observer.load_from_registry(observer, base_name="weight", args=weights)
     curr_max = 1
     curr_min = 1
     for i, tensor in enumerate(tensors):
         observer(tensor)
-        curr_max = max(observer.max_val.get("default"), curr_max)
-        curr_min = min(observer.min_val.get("default"), curr_max)
+        curr_max = max(observer.max_vals[0], curr_max)
+        curr_min = min(observer.min_vals[0], curr_min)
 
         if i < 2:
             assert curr_max == 1
@@ -108,13 +110,20 @@ def test_g_idx():
     input_shape = (128, 512)
     tensor = torch.rand(input_shape)
     weights = QuantizationArgs(num_bits=8, group_size=group_size, observer="minmax")
+
+    module = torch.nn.Linear(512, 1)
     g_idx = make_dummy_g_idx(tensor.shape[1], group_size)
+    module.weight_g_idx = g_idx
 
-    observer = weights.observer
-    observer = Observer.load_from_registry(observer, base_name="weight", args=weights)
-    scale_g_idx, zero_point_g_idx = observer(tensor, g_idx=g_idx)
+    observer = Observer.load_from_registry(
+        weights.observer, base_name="weight", args=weights, module=module
+    )
+    scale_g_idx, zero_point_g_idx = observer(tensor)
 
-    observer.reset()
+    observer = Observer.load_from_registry(
+        weights.observer, base_name="weight", args=weights, module=module
+    )
+    del module.weight_g_idx
     scale, zero_point = observer(tensor[:, torch.argsort(g_idx)])
 
     assert scale_g_idx == pytest.approx(scale)
```
