# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
from typing import List, Optional, Tuple

import torch

from torchao.quantization.quant_primitives import (
    _DTYPE_TO_BIT_WIDTH,
    _SUB_BYTE_UINT_BOUNDS,
)
from torchao.utils import _register_custom_op

quant_lib = torch.library.Library("quant", "FRAGMENT")
register_custom_op = _register_custom_op(quant_lib)
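
# Note: the "FRAGMENT" kind extends the existing "quant" torch.library
# namespace rather than defining a new library, and torchao's
# _register_custom_op helper registers each decorated function as a custom op
# in that namespace (so e.g. torch.export can keep it as a single op instead
# of tracing through its Python body).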


# wrapper around coreml util: https://github.com/apple/coremltools/blob/1c0e5cb1c1e3ab759af107b54f2be18b7c03f8aa/coremltools/models/neural_network/quantization_utils.py#L363
@torch.no_grad
@register_custom_op
def choose_qparams_and_quantize_codebook_coreml(
    input_tensor: torch.Tensor,
    code_dtype: torch.dtype,
    block_size: List[int],
    force_kmeans1d: bool = False,
    cluster_dim: int = 1,
    vector_axis: Optional[int] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Initialize the codebook using k-means clustering on blocks of the input tensor.

    Args:
        input_tensor (torch.Tensor): The input tensor to be quantized.
        code_dtype (torch.dtype): The dtype for the codes, one of
            [torch.uint1, ..., torch.uint8].
        block_size (List[int]): Controls how many elements of the last dimension
            of input_tensor belong to the same group and share the same lookup
            table. Say the original shape is (N, K) and block_size is
            (N, group_size) or (-1, group_size); then each slice of
            (N, group_size) elements uses the same lookup table, and there will
            be (K // group_size) lookup tables in total.
        force_kmeans1d (bool): Use kmeans1d regardless of the number of weights.
        cluster_dim (int): The size of the vectors in vector lookup table
            quantization. E.g. when cluster_dim is 4, instead of quantizing each
            scalar value one by one, we quantize the tensor in units of
            4-element vectors: each vector of the original tensor is mapped to a
            vector in the codebook (lookup table) based on the codes (indices).
        vector_axis (Optional[int]): Used in vector quantization, see the docs at
            https://github.com/apple/coremltools/blob/1c0e5cb1c1e3ab759af107b54f2be18b7c03f8aa/coremltools/optimize/_utils.py#L371

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The codebook (lookup table) tensor
        and the quantized tensor (codes, torch.uint8).
    """
    assert code_dtype in list(_SUB_BYTE_UINT_BOUNDS.keys()) + [torch.uint8]
    assert len(block_size) == input_tensor.ndim
    block_size = block_size.copy()
    for i in range(input_tensor.ndim - 1):
        assert block_size[i] == -1 or block_size[i] == input_tensor.shape[i], (
            f"{block_size} not supported"
        )

    group_size = block_size[-1]
    if group_size == -1:
        group_size = input_tensor.shape[-1]

    assert input_tensor.shape[-1] % group_size == 0
    assert input_tensor.ndim == 2
    assert cluster_dim == 1, (
        f"only cluster_dim == 1 is supported right now, got {cluster_dim}"
    )

    # detach for converting to numpy
    input_tensor = input_tensor.detach()
    # (N, K)
    original_shape = input_tensor.shape
    # (K // group_size)
    num_lut = input_tensor.shape[1] // group_size

    # reshape to (N, K // group_size, group_size)
    input_tensor = input_tensor.reshape(input_tensor.shape[0], num_lut, group_size)
    from coremltools.models.neural_network.quantization_utils import (
        _get_kmeans_lookup_table_and_weight,
    )

    nbits = _DTYPE_TO_BIT_WIDTH[code_dtype]
    if nbits > 8:
        print(f"Requested nbits: {nbits}, rewriting to 8 bits to reduce the size")
        nbits = 8

    res_lut = []
    # each res_w[:, i, :] will use the same lookup table
    # res_w: (N, K // group_size, group_size)
    res_w = torch.zeros_like(input_tensor, dtype=torch.uint8)
    for i in range(num_lut):
        # lut: (2**nbits, 1)
        # w: (N * group_size)
        lut, w = _get_kmeans_lookup_table_and_weight(
            nbits, input_tensor[:, i, :], force_kmeans1d, cluster_dim, vector_axis
        )
        res_lut.append(torch.from_numpy(lut))
        res_w[:, i, :] = torch.from_numpy(w.reshape(input_tensor.shape[0], group_size))

    # directly stack all lookup tables along dim 0
    # res_lut: (K // group_size, 2 ** nbits, 1)
    res_lut = torch.stack(res_lut, dim=0)

    # reshape back to (N, K)
    res_w = res_w.reshape(*original_shape)

    return res_lut, res_w
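
# A minimal usage sketch (a hypothetical example, assuming coremltools is
# installed): quantize an (8, 64) float weight to 4-bit codes, with one lookup
# table shared per group of 32 columns.
#
#     weight = torch.randn(8, 64)
#     codebook, codes = choose_qparams_and_quantize_codebook_coreml(
#         weight, torch.uint4, block_size=[-1, 32]
#     )
#     # codebook: (2, 16, 1) -- (K // group_size, 2 ** nbits, 1)
#     # codes: (8, 64) in torch.uint8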


@register_custom_op
def dequantize_codebook(
    codes: torch.Tensor,
    codebook: torch.Tensor,
    code_dtype: torch.dtype,
    block_size: List[int],
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """
    Reconstructs the original tensor from codes and the codebook.

    Args:
        codes (torch.Tensor): Indices of codebook entries for each element,
            shape (N, K) for scalar quantization.
        codebook (torch.Tensor): Codebook tensor used for quantization,
            shape (K // group_size, 2 ** nbits, 1) where K is the size of dim 1
            of the input (codes).
        code_dtype (torch.dtype): The logical dtype for the codes, one of
            [torch.uint1, ..., torch.uint8]. Note that codes is stored in
            torch.uint8; this is just additional information for the dequantize
            op.
        block_size (List[int]): A slice of elements with shape block_size shares
            the same lookup table. Only (-1, ..., group_size) is supported right
            now (all preceding dimensions have to match the input).
        output_dtype (torch.dtype): dtype for the output tensor.

    Returns:
        dequant (torch.Tensor): Reconstructed tensor, shape (N, K).
    """
    assert output_dtype in [
        torch.float32,
        torch.float16,
        torch.bfloat16,
    ], f"Unsupported output dtype: {output_dtype}"

    assert code_dtype in list(_SUB_BYTE_UINT_BOUNDS.keys()) + [torch.uint8]

    assert len(block_size) == codes.ndim
    block_size = block_size.copy()
    for i in range(codes.ndim - 1):
        assert block_size[i] == -1 or block_size[i] == codes.shape[i], (
            f"{block_size} not supported"
        )

    group_size = block_size[-1]
    if group_size == -1:
        group_size = codes.shape[-1]

    assert codes.shape[-1] % group_size == 0
    K = codes.shape[-1]
    num_lut = K // group_size
    # (N, K)
    original_shape = codes.shape

    # reshape to (N, num_lut, group_size)
    codes = codes.reshape(codes.shape[0], num_lut, group_size)
    dequant = torch.zeros_like(codes, dtype=output_dtype)

    # do the lookup for each lookup table
    # dequant shape: (N, num_lut, group_size)
    # codebook shape: (num_lut, 2 ** nbits, 1)
    # codes shape: (N, num_lut, group_size)
    for i in range(num_lut):
        # codebook[i] has shape (2 ** nbits, 1); indexing it with codes[:, i, :]
        # of shape (N, group_size) yields (N, group_size, 1), and squeezing the
        # trailing dim of size 1 leaves the (N, group_size) slice
        dequant[:, i, :] = codebook[i][codes[:, i, :]].squeeze(-1)

    dequant = dequant.reshape(*original_shape)
    return dequant.to(output_dtype)
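

# A minimal round-trip sketch (hypothetical, continuing the example above):
# dequantizing the codes should approximately reconstruct the original weight,
# since each code selects one of the 2 ** nbits k-means centroids from its
# group's lookup table.
#
#     weight_hat = dequantize_codebook(
#         codes, codebook, torch.uint4, block_size=[-1, 32]
#     )
#     # weight_hat: (8, 64), dtype torch.float32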