Skip to content
This repository was archived by the owner on Oct 11, 2025. It is now read-only.

Commit 8cc42ae

Browse files
authored
Sub-channel quantized type implementation (#120172)
This is an implementation for [RFC: Supporting Sub-Channel Quantization in MLIR](https://discourse.llvm.org/t/rfc-supporting-sub-channel-quantization-in-mlir/82694). In order to make the review process easier, the PR has been divided into the following commit labels: 1. **Add implementation for sub-channel type:** Includes the class design for `UniformQuantizedSubChannelType`, printer/parser and bytecode read/write support. The existing types (per-tensor and per-axis) are unaltered. 2. **Add implementation for sub-channel type:** Lowering of `quant.qcast` and `quant.dcast` operations to Linalg operations. 3. **Adding C/Python APIs:** We first define the C-APIs and build the Python-APIs on top of those. 4. **Add pass to normalize generic ....:** This pass normalizes sub-channel quantized types to per-tensor per-axis types, if possible. A design note: - **Explicitly storing the `quantized_dimensions`, even when they can be derived for ranked tensor.** While it's possible to infer quantized dimensions from the static shape of the scales (or zero-points) tensor for ranked data tensors ([ref](https://discourse.llvm.org/t/rfc-supporting-sub-channel-quantization-in-mlir/82694/3) for background), there are cases where this can lead to ambiguity and issues with round-tripping. ``` Consider the example: tensor<2x4x!quant.uniform<i8:f32:{0:2, 0:2}, {{s00:z00, s01:z01}}>> ``` The shape of the scales tensor is [1, 2], which might suggest that only axis 1 is quantized. While this inference is technically correct, as the block size for axis 0 is a degenerate case (equal to the dimension size), it can cause problems with round-tripping. Therefore, even for ranked tensors, we are explicitly storing the quantized dimensions. Suggestions welcome! PS: I understand that the upcoming holidays may impact your schedule, so please take your time with the review. There's no rush.
1 parent 45539c6 commit 8cc42ae

File tree

4 files changed

+193
-2
lines changed

4 files changed

+193
-2
lines changed

mlir/include/mlir-c/Dialect/Quant.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,47 @@ mlirUniformQuantizedPerAxisTypeGetQuantizedDimension(MlirType type);
172172
MLIR_CAPI_EXPORTED bool
173173
mlirUniformQuantizedPerAxisTypeIsFixedPoint(MlirType type);
174174

175+
//===---------------------------------------------------------------------===//
176+
// UniformQuantizedSubChannelType
177+
//===---------------------------------------------------------------------===//
178+
179+
/// Returns `true` if the given type is a UniformQuantizedSubChannel.
180+
MLIR_CAPI_EXPORTED bool
181+
mlirTypeIsAUniformQuantizedSubChannelType(MlirType type);
182+
183+
/// Creates a UniformQuantizedSubChannelType with the given parameters.
184+
///
185+
/// The type is owned by the context. `scalesAttr` and `zeroPointsAttr` must be
186+
/// DenseElementsAttrs. `quantizedDimensions` and `blockSizes`
187+
/// point to `blockSizeInfoLength` number of elements, describing respectively
188+
/// the quantization axis and corresponding block size.
189+
MLIR_CAPI_EXPORTED MlirType mlirUniformQuantizedSubChannelTypeGet(
190+
unsigned flags, MlirType storageType, MlirType expressedType,
191+
MlirAttribute scalesAttr, MlirAttribute zeroPointsAttr,
192+
intptr_t blockSizeInfoLength, int32_t *quantizedDimensions,
193+
int64_t *blockSizes, int64_t storageTypeMin, int64_t storageTypeMax);
194+
195+
/// Returns the number of block sizes provided in type.
196+
MLIR_CAPI_EXPORTED intptr_t
197+
mlirUniformQuantizedSubChannelTypeGetNumBlockSizes(MlirType type);
198+
199+
/// Returns the quantized dimension at the given position.
200+
MLIR_CAPI_EXPORTED int32_t
201+
mlirUniformQuantizedSubChannelTypeGetQuantizedDimension(MlirType type,
202+
intptr_t pos);
203+
204+
/// Returns the block size at the given position.
205+
MLIR_CAPI_EXPORTED int64_t
206+
mlirUniformQuantizedSubChannelTypeGetBlockSize(MlirType type, intptr_t pos);
207+
208+
/// Returns the scales of the quantized type.
209+
MLIR_CAPI_EXPORTED MlirAttribute
210+
mlirUniformQuantizedSubChannelTypeGetScales(MlirType type);
211+
212+
/// Returns the zero-points of the quantized type.
213+
MLIR_CAPI_EXPORTED MlirAttribute
214+
mlirUniformQuantizedSubChannelTypeGetZeroPoints(MlirType type);
215+
175216
//===---------------------------------------------------------------------===//
176217
// CalibratedQuantizedType
177218
//===---------------------------------------------------------------------===//

mlir/lib/Bindings/Python/DialectQuant.cpp

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@
99
#include <cstdint>
1010
#include <vector>
1111

12+
#include "mlir-c/BuiltinAttributes.h"
1213
#include "mlir-c/Dialect/Quant.h"
1314
#include "mlir-c/IR.h"
14-
#include "mlir/Bindings/Python/NanobindAdaptors.h"
1515
#include "mlir/Bindings/Python/Nanobind.h"
16+
#include "mlir/Bindings/Python/NanobindAdaptors.h"
1617

1718
namespace nb = nanobind;
1819
using namespace llvm;
@@ -284,6 +285,79 @@ static void populateDialectQuantSubmodule(const nb::module_ &m) {
284285
},
285286
"Fixed point values are real numbers divided by a scale.");
286287

288+
//===-------------------------------------------------------------------===//
289+
// UniformQuantizedSubChannelType
290+
//===-------------------------------------------------------------------===//
291+
auto uniformQuantizedSubChannelType = mlir_type_subclass(
292+
m, "UniformQuantizedSubChannelType",
293+
mlirTypeIsAUniformQuantizedSubChannelType, quantizedType.get_class());
294+
uniformQuantizedSubChannelType.def_classmethod(
295+
"get",
296+
[](nb::object cls, unsigned flags, MlirType storageType,
297+
MlirType expressedType, MlirAttribute scales, MlirAttribute zeroPoints,
298+
std::vector<int32_t> quantizedDimensions,
299+
std::vector<int64_t> blockSizes, int64_t storageTypeMin,
300+
int64_t storageTypeMax) {
301+
return cls(mlirUniformQuantizedSubChannelTypeGet(
302+
flags, storageType, expressedType, scales, zeroPoints,
303+
static_cast<intptr_t>(blockSizes.size()),
304+
quantizedDimensions.data(), blockSizes.data(), storageTypeMin,
305+
storageTypeMax));
306+
},
307+
"Gets an instance of UniformQuantizedSubChannel in the same context as "
308+
"the provided storage type.",
309+
nb::arg("cls"), nb::arg("flags"), nb::arg("storage_type"),
310+
nb::arg("expressed_type"), nb::arg("scales"), nb::arg("zero_points"),
311+
nb::arg("quantized_dimensions"), nb::arg("block_sizes"),
312+
nb::arg("storage_type_min"), nb::arg("storage_type_max"));
313+
uniformQuantizedSubChannelType.def_property_readonly(
314+
"quantized_dimensions",
315+
[](MlirType type) {
316+
intptr_t nDim =
317+
mlirUniformQuantizedSubChannelTypeGetNumBlockSizes(type);
318+
std::vector<int32_t> quantizedDimensions;
319+
quantizedDimensions.reserve(nDim);
320+
for (intptr_t i = 0; i < nDim; ++i) {
321+
quantizedDimensions.push_back(
322+
mlirUniformQuantizedSubChannelTypeGetQuantizedDimension(type, i));
323+
}
324+
return quantizedDimensions;
325+
},
326+
"Gets the quantized dimensions. Each element in the returned list "
327+
"represents an axis of the quantized data tensor that has a specified "
328+
"block size. The order of elements corresponds to the order of block "
329+
"sizes returned by 'block_sizes' method. It means that the data tensor "
330+
"is quantized along the i-th dimension in the returned list using the "
331+
"i-th block size from block_sizes method.");
332+
uniformQuantizedSubChannelType.def_property_readonly(
333+
"block_sizes",
334+
[](MlirType type) {
335+
intptr_t nDim =
336+
mlirUniformQuantizedSubChannelTypeGetNumBlockSizes(type);
337+
std::vector<int64_t> blockSizes;
338+
blockSizes.reserve(nDim);
339+
for (intptr_t i = 0; i < nDim; ++i) {
340+
blockSizes.push_back(
341+
mlirUniformQuantizedSubChannelTypeGetBlockSize(type, i));
342+
}
343+
return blockSizes;
344+
},
345+
"Gets the block sizes for the quantized dimensions. The i-th element in "
346+
"the returned list corresponds to the block size for the i-th dimension "
347+
"in the list returned by quantized_dimensions method.");
348+
uniformQuantizedSubChannelType.def_property_readonly(
349+
"scales",
350+
[](MlirType type) -> MlirAttribute {
351+
return mlirUniformQuantizedSubChannelTypeGetScales(type);
352+
},
353+
"The scales of the quantized type.");
354+
uniformQuantizedSubChannelType.def_property_readonly(
355+
"zero_points",
356+
[](MlirType type) -> MlirAttribute {
357+
return mlirUniformQuantizedSubChannelTypeGetZeroPoints(type);
358+
},
359+
"The zero points of the quantized type.");
360+
287361
//===-------------------------------------------------------------------===//
288362
// CalibratedQuantizedType
289363
//===-------------------------------------------------------------------===//

mlir/lib/CAPI/Dialect/Quant.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "mlir-c/Dialect/Quant.h"
10+
#include "mlir-c/BuiltinAttributes.h"
1011
#include "mlir/CAPI/Registration.h"
1112
#include "mlir/Dialect/Quant/IR/Quant.h"
1213
#include "mlir/Dialect/Quant/IR/QuantTypes.h"
@@ -194,6 +195,61 @@ bool mlirUniformQuantizedPerAxisTypeIsFixedPoint(MlirType type) {
194195
return cast<quant::UniformQuantizedPerAxisType>(unwrap(type)).isFixedPoint();
195196
}
196197

198+
//===---------------------------------------------------------------------===//
199+
// UniformQuantizedSubChannelType
200+
//===---------------------------------------------------------------------===//
201+
202+
/// A type qualifies iff the unwrapped MLIR type is the sub-channel variant.
bool mlirTypeIsAUniformQuantizedSubChannelType(MlirType type) {
  mlir::Type unwrapped = unwrap(type);
  return isa<quant::UniformQuantizedSubChannelType>(unwrapped);
}
205+
206+
/// Constructs a sub-channel quantized type. Both attribute arguments must be
/// DenseElementsAttrs; a null MlirType is returned otherwise.
MlirType mlirUniformQuantizedSubChannelTypeGet(
    unsigned flags, MlirType storageType, MlirType expressedType,
    MlirAttribute scalesAttr, MlirAttribute zeroPointsAttr, intptr_t nDims,
    int32_t *quantizedDimensions, int64_t *blockSizes, int64_t storageTypeMin,
    int64_t storageTypeMax) {
  auto scaleValues = dyn_cast<mlir::DenseElementsAttr>(unwrap(scalesAttr));
  if (!scaleValues)
    return {};

  auto zeroPointValues =
      dyn_cast<mlir::DenseElementsAttr>(unwrap(zeroPointsAttr));
  if (!zeroPointValues)
    return {};

  // Both arrays carry `nDims` entries: one (axis, block size) pair each.
  llvm::ArrayRef<int32_t> axes(quantizedDimensions, nDims);
  llvm::ArrayRef<int64_t> sizes(blockSizes, nDims);
  return wrap(quant::UniformQuantizedSubChannelType::get(
      flags, unwrap(storageType), unwrap(expressedType), scaleValues,
      zeroPointValues, axes, sizes, storageTypeMin, storageTypeMax));
}
224+
225+
/// Number of (axis, block size) pairs stored on the type.
intptr_t mlirUniformQuantizedSubChannelTypeGetNumBlockSizes(MlirType type) {
  auto subChannelType =
      cast<quant::UniformQuantizedSubChannelType>(unwrap(type));
  return static_cast<intptr_t>(subChannelType.getBlockSizes().size());
}
230+
231+
/// Quantized axis at index `pos`.
int32_t mlirUniformQuantizedSubChannelTypeGetQuantizedDimension(MlirType type,
                                                                intptr_t pos) {
  auto subChannelType =
      cast<quant::UniformQuantizedSubChannelType>(unwrap(type));
  return subChannelType.getQuantizedDimensions()[pos];
}
236+
237+
/// Block size at index `pos`.
int64_t mlirUniformQuantizedSubChannelTypeGetBlockSize(MlirType type,
                                                       intptr_t pos) {
  auto subChannelType =
      cast<quant::UniformQuantizedSubChannelType>(unwrap(type));
  return subChannelType.getBlockSizes()[pos];
}
242+
243+
/// Scales attribute of the quantized type.
MlirAttribute mlirUniformQuantizedSubChannelTypeGetScales(MlirType type) {
  auto subChannelType =
      cast<quant::UniformQuantizedSubChannelType>(unwrap(type));
  return wrap(subChannelType.getScales());
}
247+
248+
/// Zero-points attribute of the quantized type.
MlirAttribute mlirUniformQuantizedSubChannelTypeGetZeroPoints(MlirType type) {
  auto subChannelType =
      cast<quant::UniformQuantizedSubChannelType>(unwrap(type));
  return wrap(subChannelType.getZeroPoints());
}
252+
197253
//===---------------------------------------------------------------------===//
198254
// CalibratedQuantizedType
199255
//===---------------------------------------------------------------------===//

mlir/python/mlir/_mlir_libs/_mlir/dialects/quant.pyi

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

55

6-
from mlir.ir import Type
6+
from mlir.ir import DenseElementsAttr, Type
77

88
__all__ = [
99
"QuantizedType",
@@ -109,6 +109,26 @@ class UniformQuantizedPerAxisType(QuantizedType):
109109
@property
110110
def is_fixed_point(self) -> bool: ...
111111

112+
class UniformQuantizedSubChannelType(QuantizedType):
    # Stub for the sub-channel (blockwise) uniform quantized type. The return
    # annotation on `get` was missing; forward references are valid unquoted
    # in .pyi stubs (PEP 484).

    @classmethod
    def get(cls, flags: int, storage_type: Type, expressed_type: Type,
            scales: DenseElementsAttr, zero_points: DenseElementsAttr,
            quantized_dimensions: list[int], block_sizes: list[int],
            storage_type_min: int,
            storage_type_max: int) -> UniformQuantizedSubChannelType:
        ...

    @property
    def quantized_dimensions(self) -> list[int]: ...

    @property
    def block_sizes(self) -> list[int]: ...

    @property
    def scales(self) -> DenseElementsAttr: ...

    @property
    def zero_points(self) -> DenseElementsAttr: ...
112132

113133
def CalibratedQuantizedType(QuantizedType):
114134

0 commit comments

Comments
 (0)