Add zero-point compression for asymmetric quantization

shanjiaz · shanjiaz · commit ac326eede1b3 · 2025-11-18T19:38:32.000Z
Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
diff --git a/src/compressed_tensors/compressors/quantized_compressors/base.py b/src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -124,9 +124,21 @@ def compress(
                     compressed_dict[prefix + key] = value.to(compression_device)
 
             else:
-                # omit saving zero points for symmetric or packed quantization
-                if name.endswith("zero_point") and self._skip_zp(name, names_to_scheme):
-                    continue
+                # omit saving zero points for symmetric quantization
+                if name.endswith("weight_zero_point"):
+                    module_path = name.rsplit(".", 1)[0]
+                    if (
+                        module_path in names_to_scheme
+                        and names_to_scheme[module_path].weights.symmetric
+                    ):
+                        continue
+                    # Call compress_zp if available (for PackedQuantizationCompressor)
+                    if module_path in names_to_scheme and hasattr(self, "compress_zp"):
+                        value = self.compress_zp(
+                            value, names_to_scheme[module_path].weights
+                        )
+                        if value is None:
+                            continue
 
                 if name.endswith("weight_scale") and self._skip_scale():
                     continue
@@ -140,21 +152,6 @@ def _skip_scale(self):
 
         return isinstance(self, NVFP4PackedCompressor)
 
-    def _skip_zp(
-        self, name: str, names_to_scheme: Dict[str, QuantizationScheme]
-    ) -> bool:
-        module_name, zp_name = name.rsplit(".", 1) if "." in name else ("", name)
-        scheme = names_to_scheme[module_name]
-
-        if zp_name == "weight_zero_point":
-            args = scheme.weights
-        if zp_name == "input_zero_point":
-            args = scheme.input_activations
-        if zp_name == "output_zero_point":
-            args = scheme.output_activations
-
-        return args.symmetric
-
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
diff --git a/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py b/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py
@@ -184,6 +184,22 @@ def decompress_weight(
 
         return decompressed_weight
 
+    def compress_zp(
+        self, zero_point: Tensor, quantization_args: Optional[QuantizationArgs] = None
+    ) -> Optional[Tensor]:
+        if zero_point is None or quantization_args.symmetric:
+            return None
+        if zero_point.dtype == torch.int32:
+            return zero_point
+        if quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            return pack_to_int32(
+                zero_point, quantization_args.num_bits, packed_dim=0
+            ).contiguous()
+        return zero_point
+
 
 def pack_to_int32(
     value: torch.Tensor,