Commit f0ae89e

feat: add zero-point decompression support for asymmetric quantization
- Fix decompress_weight method in PackedQuantizationCompressor to support unpacking zero-points
- Add comprehensive tests for zero-point packing/unpacking with GROUP and CHANNEL strategies
- Add end-to-end integration tests for asymmetric quantization workflow
- Ensure packed tensors are contiguous for safetensors compatibility

Resolves issue referenced in vllm-project/llm-compressor#1704
1 parent 5718b29 commit f0ae89e

File tree

3 files changed (+325, -13 lines)
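For context: with asymmetric quantization the dequantizer recovers weights as w ≈ (q - zero_point) * scale, so the zero-points that compress_weight packs away must be unpacked again before dequantize can run, which is what this commit adds. A minimal sketch of that arithmetic with made-up numbers (illustrative only, not the library's API):

import torch

# Illustrative values: one scale/zero_point pair and three already-unpacked
# int4 weight codes. Asymmetric dequantization is w_hat = (q - zero_point) * scale.
scale = torch.tensor([0.05])
zero_point = torch.tensor([3], dtype=torch.int8)
q = torch.tensor([-2, 0, 5], dtype=torch.int8)

w_hat = (q.to(torch.float32) - zero_point.to(torch.float32)) * scale
print(w_hat)  # -> tensor([-0.2500, -0.1500,  0.1000])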

src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

Lines changed: 6 additions & 13 deletions
@@ -134,16 +134,14 @@ def compress_weight(
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight
 
-        # We typically don't compress zp; apart from when using the packed_compressor
-        # and when storing group/channel zp
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
         ]:
             packed_zp = pack_to_int32(
                 zero_point, quantization_args.num_bits, packed_dim=0
             )
-            compressed_dict["weight_zero_point"] = packed_zp
+            compressed_dict["weight_zero_point"] = packed_zp.contiguous()
         return compressed_dict
 
     def decompress_weight(
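The .contiguous() call added above matters because safetensors' save_file rejects tensors backed by non-contiguous storage, and packing along dim 0 can leave the packed tensor non-contiguous (for example when it is produced via a transpose). A small sketch of the property in question, using a transpose just to create a non-contiguous tensor:

import torch

# A transpose yields a view with non-contiguous storage, similar to what a
# pack-along-dim-0 implementation can produce.
packed_zp = torch.arange(16, dtype=torch.int32).reshape(4, 4).t()
print(packed_zp.is_contiguous())               # False -> save_file would reject it
print(packed_zp.contiguous().is_contiguous())  # True  -> safe to serialize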
@@ -166,20 +164,15 @@ def decompress_weight(
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
 
-        # NOTE: this will fail decompression as we don't currently handle packed zp on
-        # decompression
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
         ]:
-            raise ValueError(
-                "Decompression of packed zero points is currently not supported"
-            )
-            assert zero_point is not None
-            original_zp_shape = (original_shape[0], scale.shape[-1])
-            zero_point = unpack_from_int32(
-                zero_point, num_bits, original_zp_shape, packed_dim=0
-            )
+            if zero_point is not None:
+                original_zp_shape = (original_shape[0], scale.shape[-1])
+                zero_point = unpack_from_int32(
+                    zero_point, num_bits, original_zp_shape, packed_dim=0
+                )
 
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
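The unpacked zero-point shape is recovered from the scale: original_zp_shape = (original_shape[0], scale.shape[-1]) is (out_features, num_groups) under the GROUP strategy and (out_features, 1) under CHANNEL. A worked shape example, assuming the packed dimension shrinks by a factor of 32 // num_bits (the layer sizes below are hypothetical):

import math

# Hypothetical layer: 4-bit GROUP quantization of a (512, 1024) weight, group_size=128.
out_features, in_features = 512, 1024
num_bits, group_size = 4, 128

num_groups = in_features // group_size                   # 8
zp_shape = (out_features, num_groups)                    # (512, 8), matches scale.shape
pack_factor = 32 // num_bits                             # 8 values per int32 word
packed_zp_rows = math.ceil(out_features / pack_factor)   # 64, since packing runs along dim 0
print(zp_shape, (packed_zp_rows, num_groups))            # (512, 8) (64, 8)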
Lines changed: 228 additions & 0 deletions
@@ -0,0 +1,228 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+End-to-end tests for asymmetric quantization with zero-point decompression.
+"""
+
+import shutil
+import tempfile
+from pathlib import Path
+
+import pytest
+import torch
+from compressed_tensors import PackedQuantizationCompressor
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationConfig,
+    QuantizationScheme,
+    QuantizationStrategy,
+    apply_quantization_config,
+)
+from compressed_tensors.quantization.lifecycle.forward import fake_quantize
+from safetensors.torch import save_file
+from torch.nn import Linear, Module, Sequential
+
+
+class SimpleModel(Module):
+    """Simple model for testing"""
+    def __init__(self, input_dim=512, hidden_dim=256, output_dim=128):
+        super().__init__()
+        self.layer1 = Linear(input_dim, hidden_dim, bias=False)
+        self.layer2 = Linear(hidden_dim, output_dim, bias=False)
+
+    def forward(self, x):
+        x = self.layer1(x)
+        x = torch.relu(x)
+        x = self.layer2(x)
+        return x
+
+
+def create_asymmetric_quant_config(
+    num_bits=4,
+    strategy=QuantizationStrategy.GROUP,
+    group_size=128
+) -> QuantizationConfig:
+    """Create an asymmetric quantization config"""
+    config_groups = {
+        "group_1": QuantizationScheme(
+            targets=["Linear"],
+            weights=QuantizationArgs(
+                num_bits=num_bits,
+                strategy=strategy.value,
+                group_size=group_size if strategy == QuantizationStrategy.GROUP else None,
+                symmetric=False,
+            ),
+        ),
+    }
+    return QuantizationConfig(config_groups=config_groups)
+
+
+@pytest.mark.parametrize(
+    "strategy,group_size",
+    [
+        (QuantizationStrategy.GROUP, 128),
+        (QuantizationStrategy.CHANNEL, None),
+    ],
+)
+def test_end_to_end_asymmetric_quantization(strategy, group_size):
+    """
+    Test end-to-end workflow: quantize -> compress -> save -> load -> decompress -> use
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_path = Path(tmp_dir)
+
+        model = SimpleModel()
+        original_weights = {
+            "layer1": model.layer1.weight.clone(),
+            "layer2": model.layer2.weight.clone(),
+        }
+
+        quant_config = create_asymmetric_quant_config(
+            num_bits=4,
+            strategy=strategy,
+            group_size=group_size
+        )
+        apply_quantization_config(model, quant_config)
+
+        for name, module in model.named_modules():
+            if isinstance(module, Linear):
+                weight = module.weight
+                if strategy == QuantizationStrategy.CHANNEL:
+                    scale_shape = (weight.shape[0], 1)
+                else:
+                    scale_shape = (weight.shape[0], weight.shape[1] // group_size)
+
+                module.weight_scale = torch.nn.Parameter(
+                    torch.rand(scale_shape) * 0.1,
+                    requires_grad=False
+                )
+                module.weight_zero_point = torch.nn.Parameter(
+                    torch.randint(-8, 8, scale_shape, dtype=torch.int8),
+                    requires_grad=False
+                )
+
+        compressor = PackedQuantizationCompressor(config=quant_config)
+        quantized_modules_to_scheme = {
+            "layer1": quant_config.config_groups["group_1"],
+            "layer2": quant_config.config_groups["group_1"],
+        }
+
+        state_dict = model.state_dict()
+        compressed_state_dict = compressor.compress(
+            state_dict, names_to_scheme=quantized_modules_to_scheme
+        )
+
+        assert "layer1.weight_zero_point" in compressed_state_dict
+        assert "layer2.weight_zero_point" in compressed_state_dict
+        assert compressed_state_dict["layer1.weight_zero_point"].dtype == torch.int32
+        assert compressed_state_dict["layer2.weight_zero_point"].dtype == torch.int32
+
+        save_file(compressed_state_dict, tmp_path / "model.safetensors")
+
+        reconstructed_gen = compressor.decompress(
+            tmp_path, names_to_scheme=quantized_modules_to_scheme
+        )
+
+        reconstructed_weights = {}
+        for module_name, module_data in reconstructed_gen:
+            reconstructed_weights[module_name] = module_data
+
+        assert "layer1" in reconstructed_weights
+        assert "layer2" in reconstructed_weights
+        assert "weight" in reconstructed_weights["layer1"]
+        assert "weight" in reconstructed_weights["layer2"]
+
+        assert reconstructed_weights["layer1"]["weight"].shape == original_weights["layer1"].shape
+        assert reconstructed_weights["layer2"]["weight"].shape == original_weights["layer2"].shape
+
+        new_model = SimpleModel()
+        new_model.layer1.weight.data = reconstructed_weights["layer1"]["weight"]
+        new_model.layer2.weight.data = reconstructed_weights["layer2"]["weight"]
+
+        test_input = torch.randn(1, 512)
+        with torch.no_grad():
+            output = new_model(test_input)
+
+        assert output.shape == (1, 128)
+        assert not torch.isnan(output).any()
+        assert not torch.isinf(output).any()
+
+
+@pytest.mark.parametrize("num_bits", [4, 8])
+def test_asymmetric_quantization_accuracy(num_bits):
+    """
+    Test that asymmetric quantization with zero-point preserves accuracy better
+    than symmetric quantization for biased weight distributions.
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_path = Path(tmp_dir)
+
+        shape = (256, 512)
+        weights = torch.randn(shape) + 2.0
+
+        quant_config = create_asymmetric_quant_config(
+            num_bits=num_bits,
+            strategy=QuantizationStrategy.GROUP,
+            group_size=128
+        )
+
+        group_size = 128
+        num_groups = shape[1] // group_size
+        scale_shape = (shape[0], num_groups)
+
+        scales = torch.rand(scale_shape) * 0.1
+        zero_points = torch.randint(-2**(num_bits-1), 2**(num_bits-1), scale_shape, dtype=torch.int8)
+
+        state_dict = {
+            "layer.weight": weights,
+            "layer.weight_scale": scales,
+            "layer.weight_zero_point": zero_points,
+        }
+
+        compressor = PackedQuantizationCompressor(config=quant_config)
+        quantized_modules_to_scheme = {"layer": quant_config.config_groups["group_1"]}
+
+        compressed_state_dict = compressor.compress(
+            state_dict.copy(), names_to_scheme=quantized_modules_to_scheme
+        )
+
+        save_file(compressed_state_dict, tmp_path / "model.safetensors")
+
+        reconstructed_gen = compressor.decompress(
+            tmp_path, names_to_scheme=quantized_modules_to_scheme
+        )
+
+        reconstructed = {}
+        for module_name, module_data in reconstructed_gen:
+            reconstructed[module_name] = module_data
+
+        assert "layer" in reconstructed
+        assert "weight" in reconstructed["layer"]
+        assert reconstructed["layer"]["weight"].shape == shape
+
+        decompressed_weights = reconstructed["layer"]["weight"]
+        assert not torch.isnan(decompressed_weights).any()
+        assert not torch.isinf(decompressed_weights).any()
+
+        assert decompressed_weights.abs().max() < 100
+        assert decompressed_weights.abs().max() > 0.01
+
+
+if __name__ == "__main__":
+    test_end_to_end_asymmetric_quantization(QuantizationStrategy.GROUP, 128)
+    test_end_to_end_asymmetric_quantization(QuantizationStrategy.CHANNEL, None)
+    test_asymmetric_quantization_accuracy(4)
+    test_asymmetric_quantization_accuracy(8)
+    print("All tests passed!")
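The accuracy test above uses weights biased away from zero (torch.randn(shape) + 2.0), which is the case where a zero-point pays off: a symmetric grid centered on zero spends half its codes on values that never occur. A rough, self-contained illustration of that effect on a single 4-bit group (the helper below is hypothetical and not part of the test suite):

import torch

def mean_quant_error(w, symmetric, qmin=-8, qmax=7):
    """Fake-quantize one group of weights to int4 and return the mean absolute error."""
    if symmetric:
        scale = w.abs().max() / qmax
        zero_point = torch.tensor(0.0)
    else:
        scale = (w.max() - w.min()) / (qmax - qmin)
        zero_point = torch.round(qmin - w.min() / scale)
    q = torch.clamp(torch.round(w / scale) + zero_point, qmin, qmax)
    return ((q - zero_point) * scale - w).abs().mean()

w = torch.randn(128) + 2.0  # biased weights, as in the accuracy test
print(mean_quant_error(w, symmetric=True), mean_quant_error(w, symmetric=False))
# The asymmetric error is typically noticeably smaller for this distribution.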

tests/test_compressors/quantized_compressors/test_pack_quant.py

Lines changed: 91 additions & 0 deletions
@@ -473,3 +473,94 @@ def test_unpack_from_int32(num_bits, values, expected_tensor):
     unpacked_tensor = unpack_from_int32(values, num_bits, expected_tensor.shape)
     assert torch.equal(unpacked_tensor, unpacked_tensor)
     assert unpacked_tensor.dtype == unpacked_tensor.dtype
+
+
+@pytest.mark.parametrize(
+    "strategy,group_size",
+    [
+        (QuantizationStrategy.GROUP, 128),
+        (QuantizationStrategy.CHANNEL, None),
+    ],
+)
+def test_asymmetric_zero_point_decompression(strategy, group_size, tmp_path):
+    """
+    Test that zero-point packing and unpacking works correctly for asymmetric quantization
+    with GROUP and CHANNEL strategies.
+    """
+    shape = (512, 1024)
+
+    if strategy == QuantizationStrategy.CHANNEL:
+        expected_zp_shape = (shape[0], 1)
+    elif strategy == QuantizationStrategy.GROUP:
+        num_groups = shape[1] // group_size
+        expected_zp_shape = (shape[0], max(num_groups, 1))
+
+    dense_state_dict = {
+        "dummy.weight": torch.randn(shape),
+        "dummy.weight_scale": torch.rand(expected_zp_shape).to(torch.float32),
+        "dummy.weight_zero_point": torch.randint(-8, 8, expected_zp_shape).to(torch.int8),
+    }
+
+    quant_config = get_dummy_quant_config(
+        num_bits=4,
+        strategy=strategy.value,
+        symmetric=False,
+        group_size=group_size
+    )
+
+    compressor = PackedQuantizationCompressor(config=quant_config)
+    quantized_modules_to_scheme = {"dummy": quant_config.config_groups["group_1"]}
+    compressed_state_dict = compressor.compress(
+        dense_state_dict.copy(), names_to_scheme=quantized_modules_to_scheme
+    )
+
+    assert "dummy.weight_zero_point" in compressed_state_dict
+    assert compressed_state_dict["dummy.weight_zero_point"].dtype == torch.int32
+
+    save_file(compressed_state_dict, tmp_path / "model.safetensors")
+
+    reconstructed_dense_gen = compressor.decompress(
+        tmp_path, names_to_scheme=quantized_modules_to_scheme
+    )
+    reconstructed_dense = {}
+    for name, value in reconstructed_dense_gen:
+        reconstructed_dense[name] = value
+
+    assert "dummy" in reconstructed_dense
+    assert "weight" in reconstructed_dense["dummy"]
+
+    assert reconstructed_dense["dummy"]["weight"].shape == shape
+
+    shutil.rmtree(tmp_path)
+
+
+@pytest.mark.parametrize(
+    "num_bits,strategy",
+    [
+        (4, QuantizationStrategy.GROUP),
+        (4, QuantizationStrategy.CHANNEL),
+        (8, QuantizationStrategy.GROUP),
+        (8, QuantizationStrategy.CHANNEL),
+    ],
+)
+def test_zero_point_pack_unpack_consistency(num_bits, strategy):
+    """
+    Test that packing and unpacking zero-points preserves values correctly.
+    """
+    if strategy == QuantizationStrategy.GROUP:
+        shape = (512, 8)
+        group_size = 128
+    else:
+        shape = (512, 1)
+        group_size = None
+
+    max_val = (1 << (num_bits - 1)) - 1
+    min_val = -(1 << (num_bits - 1))
+    original_zp = torch.randint(min_val, max_val + 1, shape).to(torch.int8)
+
+    packed_zp = pack_to_int32(original_zp, num_bits, packed_dim=0)
+
+    unpacked_zp = unpack_from_int32(packed_zp, num_bits, shape, packed_dim=0)
+
+    assert torch.equal(original_zp, unpacked_zp)
+    assert unpacked_zp.dtype == torch.int8
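test_zero_point_pack_unpack_consistency round-trips zero-points through pack_to_int32 and unpack_from_int32. As a mental model of what such a round trip involves, here is a self-contained sketch that packs signed int4 values into int32 words and back; the bit layout is illustrative and not necessarily the library's exact ordering:

import torch

NUM_BITS = 4
PACK_FACTOR = 32 // NUM_BITS   # 8 int4 values per int32 word
OFFSET = 1 << (NUM_BITS - 1)   # shift [-8, 7] into [0, 15]
MASK = (1 << NUM_BITS) - 1

def pack_values(values):
    """Pack a flat list of signed int4 values into int32 words (illustrative layout)."""
    words = []
    for i in range(0, len(values), PACK_FACTOR):
        word = 0
        for j, v in enumerate(values[i:i + PACK_FACTOR]):
            word |= ((int(v) + OFFSET) & MASK) << (j * NUM_BITS)
        # Reinterpret the unsigned 32-bit pattern as a signed int32
        words.append(word - (1 << 32) if word >= (1 << 31) else word)
    return torch.tensor(words, dtype=torch.int32)

def unpack_values(words, count):
    """Invert pack_values, recovering `count` signed int4 values."""
    values = []
    for word in words.tolist():
        unsigned = word & 0xFFFFFFFF
        for j in range(PACK_FACTOR):
            if len(values) == count:
                break
            values.append(((unsigned >> (j * NUM_BITS)) & MASK) - OFFSET)
    return torch.tensor(values, dtype=torch.int8)

zp = torch.randint(-8, 8, (16,), dtype=torch.int8)
assert torch.equal(unpack_values(pack_values(zp.tolist()), 16), zp)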
