
Commit bf2b0d0

Add INT8 realquant support (#166)
Co-authored-by: ishan-modi <[email protected]>
1 parent 7a04743 commit bf2b0d0

File tree: 5 files changed, +158 −0 lines changed

modelopt/torch/quantization/nn/modules/tensor_quantizer.py (7 additions, 0 deletions)

@@ -37,6 +37,7 @@
     BaseQuantizedTensor,
     FP8QTensor,
     INT4QTensor,
+    INT8QTensor,
     NF4QTensor,
     NVFP4QTensor,
     QTensorWrapper,
@@ -547,6 +548,7 @@ def _is_real_quantize_support(self):
            (self._num_bits == 4 and self._block_sizes)  # NF4 and Int4
            or (self._num_bits == (2, 1) and self._block_sizes)  # NVFP4
            or (self._num_bits == (4, 3))  # FP8
+           or (self._num_bits == 8)  # Int8
        ):
            return True
        return False
@@ -565,6 +567,11 @@ def _real_quantize(self, inputs):
                scales=self.amax / 448.0 if self.amax is not None else None,
            )
            buffer_to_register["_scale"] = _scale
+       elif self._num_bits == 8:
+           outputs, _scale = INT8QTensor.quantize(
+               inputs, axis=self._axis, block_sizes=self._block_sizes
+           )
+           buffer_to_register["_scale"] = _scale
        elif self._block_sizes.get("scale_bits", 0) == 8 and self._block_sizes.get(
            "scale_block_sizes", None
        ):
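These hunks route any quantizer configured with num_bits == 8 through INT8QTensor.quantize during real quantization. A minimal standalone sketch of the updated support check, with the quantizer attributes replaced by plain arguments (the helper name below is illustrative, not part of the module):

def is_real_quantize_supported(num_bits, block_sizes) -> bool:
    # Mirrors the _is_real_quantize_support condition after this commit.
    return bool(
        (num_bits == 4 and block_sizes)  # NF4 and Int4
        or (num_bits == (2, 1) and block_sizes)  # NVFP4
        or num_bits == (4, 3)  # FP8
        or num_bits == 8  # Int8, added here
    )

assert is_real_quantize_supported(8, None)  # plain INT8 needs no block sizes
assert not is_real_quantize_supported(4, None)  # INT4 still requires block sizes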

modelopt/torch/quantization/qtensor/__init__.py (1 addition, 0 deletions)

@@ -20,4 +20,5 @@
 from .base_qtensor import *
 from .fp8_tensor import *
 from .int4_tensor import *
+from .int8_tensor import *
 from .nf4_tensor import *
modelopt/torch/quantization/qtensor/int8_tensor.py (new file, 125 additions, 0 deletions)

@@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Implements INT8 quantization for efficient tensor storage and computation."""
+
+from typing import Union
+
+import torch
+
+from ..qtensor.base_qtensor import BaseQuantizedTensor
+from ..utils import (
+    convert_quantization_axis_to_reduce_axis,
+    reduce_amax,
+    reduce_block_amax,
+    reduce_block_padding,
+)
+
+
+class INT8QTensor(BaseQuantizedTensor):
+    """Implements the INT8 quantization on tensors for more efficient storage or computation.
+
+    Attributes:
+        quantized_data (torch.Tensor): The quantized data stored as an INT8 tensor.
+    """
+
+    @classmethod
+    def quantize(
+        cls,
+        input: torch.Tensor,
+        scales: torch.Tensor = None,
+        axis: Union[tuple, int, None] = None,
+        block_sizes: dict = None,
+    ) -> tuple:
+        """Converting a tensor to a quantized format based on INT8 quantization.
+
+        Args:
+            input (torch.Tensor): The input tensor to be quantized.
+            scales (torch.Tensor): The scales for quantization.
+            axis: The dimensions to reduce for quantization. None or int or tuple of ints.
+            block_sizes (dict): A dictionary specifying the block size for each dimension.
+                Note: One can only provide axis or block_sizes for INT8 quantization.
+
+        Returns:
+            tuple: INT8QTensor, scales
+        """
+        original_input = input
+        if scales is None:
+            if block_sizes:
+                input = reduce_block_padding(input, block_sizes)
+                amax = reduce_block_amax(input, block_sizes)
+            else:
+                reduce_axis = convert_quantization_axis_to_reduce_axis(input, axis)
+                amax = reduce_amax(input, axis=reduce_axis)
+            scales = amax / 127.0
+
+        # Calculate the scale shape and make sure it aligns with input and block_sizes
+        expected_shape = list(input.shape)
+        expanded_scales = scales.clone()
+        if block_sizes:
+            for dim, block_size in block_sizes.items():
+                dim = dim if dim >= 0 else len(input.shape) + dim  # Convert negative index
+                assert input.shape[dim] % block_size == 0, (
+                    f"Tensor dimension {dim}, {input.shape[dim]} is not divisible by {block_size}."
+                )
+                expected_shape[dim] = (
+                    input.shape[dim] // block_size
+                )  # Adjust expected shape for blocks
+
+            # Assert the shape of `scales` matches expected reduced dimensions
+            assert scales.shape == tuple(expected_shape), (
+                f"Mismatch in expected scale shape: {scales.shape} vs {tuple(expected_shape)}"
+            )
+
+            # Expand scales for broadcasting
+            for dim, block_size in block_sizes.items():
+                expanded_scales = expanded_scales.repeat_interleave(block_size, dim=dim)
+
+        # Quantization
+        quantized_data = (input / expanded_scales).round().clamp(-128, 127).to(torch.int8)
+
+        return cls(original_input.shape, original_input.dtype, quantized_data), scales
+
+    def dequantize(self, dtype: torch.dtype = None, **kwarg):
+        """Dequantize INT8 packed tensor to a target dtype."""
+        if dtype is None:
+            dtype = self.metadata["dtype"]
+        assert "scale" in kwarg, "Require scale for INT8 dequantization."
+
+        # Get args
+        scales = kwarg["scale"]
+        block_sizes = kwarg.get("block_sizes", None)
+
+        shape = self._quantized_data.shape
+        if block_sizes:
+            # Compute expanded shape for broadcasting scales
+            expanded_shape = list(shape)
+            for dim, block_size in block_sizes.items():
+                assert shape[dim] % block_size == 0, (
+                    f"Dimension {shape[dim]} is not divisible by {block_size}."
+                )
+                expanded_shape[dim] //= block_size  # Reduce the dimension size for blocks
+
+            assert tuple(expanded_shape) == scales.shape, (
+                f"Scales shape {scales.shape} must match expected {tuple(expanded_shape)}."
+            )
+
+            # Expand scales for broadcasting
+            for dim, block_size in block_sizes.items():
+                scales = scales.repeat_interleave(block_size, dim=dim)
+
+        # Handle padded tensors
+        slices = tuple(slice(0, dim) for dim in self.metadata["shape"])
+
+        return (self._quantized_data.view(torch.int8).to(dtype) * scales.to(dtype))[slices]
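INT8QTensor stores only the int8 payload plus the original shape and dtype metadata; the scales are returned to the caller (the tensor quantizer registers them as a _scale buffer). A small round-trip sketch, assuming the class is importable from modelopt.torch.quantization.qtensor as exported above and that dequantize receives the scale through the scale keyword:

import torch

from modelopt.torch.quantization.qtensor import INT8QTensor

# 2D block quantization: one scale per 2x2 block, mirroring the new unit test.
weight = torch.tensor([[0.0, 1.0, 2.0, 3.0], [4.0, 5.0, 6.0, 7.0]], dtype=torch.bfloat16)
block_sizes = {-1: 2, -2: 2}

qtensor, scales = INT8QTensor.quantize(weight, block_sizes=block_sizes)
print(scales.shape)  # torch.Size([1, 2]) -- one scale per 2x2 block

# Dequantize back to bfloat16; scale and block_sizes are passed as keyword arguments.
recovered = qtensor.dequantize(dtype=torch.bfloat16, scale=scales, block_sizes=block_sizes)
print((recovered - weight).abs().max())  # small quantization error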

tests/gpu/torch/quantization/test_qtensor_cuda.py (22 additions, 0 deletions)

@@ -146,6 +146,28 @@ def test_amax_from_tensor_quantizer(
            dtype=torch.bfloat16,
        ),
    ),
+   # INT8 per channel quantization
+   (
+       8,
+       None,
+       0,
+       torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.bfloat16),
+       torch.tensor(
+           [[0.0000, 0.9922, 1.9844, 2.9844, 3.9688, 4.9688, 5.9688, 7.0000]],
+           dtype=torch.bfloat16,
+       ),
+   ),
+   # INT8 2D block quantization
+   (
+       8,
+       {-1: 2, -2: 2},
+       None,
+       torch.tensor([[0, 1, 2, 3], [4, 5, 6, 7]], dtype=torch.bfloat16),
+       torch.tensor(
+           [[0.0000, 1.0234, 1.9844, 2.9844], [4.0000, 5.0000, 5.9688, 7.0000]],
+           dtype=torch.bfloat16,
+       ),
+   ),
    # FP8, 2D block scales
    (
        (4, 3),
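The expected tensors above are just quantize-dequantize round trips: for the per-channel case the scale is amax / 127 = 7 / 127, each element is rounded to the nearest int8 step, scaled back, and kept in bfloat16, which is why 1 comes back as 0.9922 and 4 as 3.9688. A quick hand check of those values, independent of the test harness (with a single-row input, the per-channel scale equals the per-tensor amax / 127):

import torch

x = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.bfloat16)
scale = x.abs().amax() / 127.0
q = (x / scale).round().clamp(-128, 127).to(torch.int8)
dq = q.to(torch.bfloat16) * scale
print(dq)  # should match the expected per-channel tensor above (0.0000, 0.9922, ..., 7.0000)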

tests/gpu/torch/quantization/test_real_quantize_cuda.py (3 additions, 0 deletions)

@@ -31,6 +31,7 @@
    "config",
    [
        mtq.INT4_AWQ_CFG,
+       mtq.INT8_DEFAULT_CFG,
        mtq.FP8_DEFAULT_CFG,
        mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
        mtq.FP8_PER_CHANNEL_PER_TOKEN_CFG,
@@ -82,6 +83,7 @@ def forward_loop(model):
    "config",
    [
        mtq.INT4_AWQ_CFG,
+       mtq.INT8_DEFAULT_CFG,
        mtq.FP8_DEFAULT_CFG,
        mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
        mtq.FP8_PER_CHANNEL_PER_TOKEN_CFG,
@@ -105,6 +107,7 @@ def test_save_restore(model_cls, config):
    "quant_config",
    [
        mtq.INT4_AWQ_CFG,
+       mtq.INT8_DEFAULT_CFG,
        mtq.FP8_DEFAULT_CFG,
        mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
        mtq.FP8_PER_CHANNEL_PER_TOKEN_CFG,
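With INT8_DEFAULT_CFG added to these parametrized tests, INT8 real quantization goes through the same entry point as the existing formats. A rough end-to-end sketch, assuming the usual modelopt calibration flow; the toy model, the data, and the weight-compression step in the final comment are illustrative assumptions, not part of this commit:

import torch
import modelopt.torch.quantization as mtq

# Hypothetical toy model and calibration loop -- stand-ins for a real workload.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))

def forward_loop(m):
    # Run a few batches so amax values can be calibrated.
    for _ in range(8):
        m(torch.randn(4, 64))

# Insert quantizers and calibrate with the INT8 default config used in the tests above.
model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop)

# Assumption: materializing weights as real INT8QTensor storage is a separate step
# (e.g. mtq.compress(model)); consult the modelopt docs for the exact API.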
