Commit f1d9171

sdasgup3 and ermilindwalekar authored and committed

Sub-channel quantized type implementation (llvm#120172)
This is an implementation of [RFC: Supporting Sub-Channel Quantization in MLIR](https://discourse.llvm.org/t/rfc-supporting-sub-channel-quantization-in-mlir/82694). To make the review process easier, the PR has been divided into the following commit labels:

1. **Add implementation for sub-channel type:** Includes the class design for `UniformQuantizedSubChannelType`, printer/parser and bytecode read/write support. The existing types (per-tensor and per-axis) are unaltered.
2. **Add implementation for sub-channel type:** Lowering of `quant.qcast` and `quant.dcast` operations to Linalg operations.
3. **Adding C/Python APIs:** We first define the C APIs and build the Python APIs on top of those.
4. **Add pass to normalize generic ....:** This pass normalizes sub-channel quantized types to per-tensor and per-axis types, if possible.

A design note:

- **Explicitly storing the `quantized_dimensions`, even when they can be derived for ranked tensors.** While it is possible to infer the quantized dimensions from the static shape of the scales (or zero-points) tensor for ranked data tensors ([ref](https://discourse.llvm.org/t/rfc-supporting-sub-channel-quantization-in-mlir/82694/3) for background), there are cases where this can lead to ambiguity and issues with round-tripping. Consider the example:

  ```
  tensor<2x4x!quant.uniform<i8:f32:{0:2, 1:2}, {{s00:z00, s01:z01}}>>
  ```

  The shape of the scales tensor is [1, 2], which might suggest that only axis 1 is quantized. While this inference is technically correct, since the block size for axis 0 is a degenerate case (equal to the dimension size), it can cause problems with round-tripping. Therefore, even for ranked tensors, we explicitly store the quantized dimensions.

Suggestions welcome!

PS: I understand that the upcoming holidays may impact your schedule, so please take your time with the review. There's no rush.
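To illustrate the round-tripping concern in the design note, here is a hedged Python sketch (not part of the patch; `infer_block_sizes` is a hypothetical helper) of what a parser without explicitly stored quantized dimensions would have to do, and why it is ambiguous:

```python
# Hypothetical helper: infer per-axis block sizes from the data shape and the
# scales shape, as a round-tripping parser would have to without explicitly
# stored quantized dimensions.
def infer_block_sizes(data_shape, scales_shape):
    return [d // s for d, s in zip(data_shape, scales_shape)]

# tensor<2x4x...> with scales of shape [1, 2]: the inferred block size for
# axis 0 equals the dimension size (a degenerate "whole axis" block), so a
# type spelled {0:2, 1:2} and one spelled {1:2} describe the same scales
# tensor and cannot be told apart on re-parse -- hence the explicit storage.
assert infer_block_sizes([2, 4], [1, 2]) == [2, 2]
```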
1 parent 6aa4a66 commit f1d9171

File tree

13 files changed: +835 −95 lines changed


mlir/include/mlir/Dialect/Quant/IR/QuantBase.td

Lines changed: 104 additions & 0 deletions

````diff
@@ -279,6 +279,110 @@ def Quant_Dialect : Dialect {
 // Correct. The quantized type now includes 3 scale values, matching the
 // size of dimension 1 of the result tensor.
 %result = quant.qcast %input : tensor<?x3xf32> to tensor<?x3x!quant.uniform<i8:f32:1, {2.0, 3.0, 4.0}>>
+
+## Sub-channel quantization integrity
+
+When type `!quant.uniform` contains sub-channel quantization information,
+the following rules are enforced. For efficiency, these rules are actively
+enforced by the verifiers of `quant` dialect ops, but they must be
+respected in any context in which the `!quant.uniform` data type is used,
+such as the header of a `func.func` op, or the input of an arithmetic
+operation.
+
+- A quantized type with sub-channel quantization information must be the
+  element type of a tensor container type, and may not occur directly as
+  the data type of a scalar value.
+
+  ```
+  // Incorrect. Type !quant.uniform specifies sub-channel quantization for a
+  // scalar type.
+  %result = quant.qcast %input : f32 to !quant.uniform<i8:f32:{0:1, 1:2}, {{1.0}, {2.0}}>
+
+  // Correct. Type `!quant.uniform` with sub-channel quantization is wrapped
+  // in a `tensor` type.
+  %result = quant.qcast %input : tensor<2x2xf32> to
+            tensor<2x2x!quant.uniform<i8:f32:{0:1, 1:2}, {{1.0}, {2.0}}>>
+  ```
+
+- The tensor containing the sub-channel quantized type must be ranked.
+
+  ```
+  // Incorrect. Type !quant.uniform specifies sub-channel quantization for an
+  // unranked tensor type.
+  %result = quant.qcast %input : tensor<*xf32> to
+            tensor<*x!quant.uniform<i8:f32:{0:1, 1:2}, {{1.0}, {2.0}}>>
+  ```
+
+- The axis for which a block size is specified must be valid for a tensor
+  of the given rank. Block sizes can be specified for a subset of axes.
+  Any unspecified block size for an axis i defaults to the tensor dimension
+  size of that axis (shape(tensor)[i]).
+
+  ```
+  // Incorrect. The block size is specified for axis 2, which is greater
+  // than the rank of the tensor.
+  %result = quant.qcast %input : tensor<2x2xf32> to
+            tensor<2x2x!quant.uniform<i8:f32:{2:1, 1:2}, {{1.0}, {2.0}}>>
+
+  // Incorrect. The block size is specified for a negative axis.
+  %result = quant.qcast %input : tensor<2x2xf32> to
+            tensor<2x2x!quant.uniform<i8:f32:{-1:1, 1:2}, {{1.0}, {2.0}}>>
+
+  // Correct. The block size for axis 1 is skipped and defaults to 2, the
+  // dimension size of the tensor at axis 1.
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:3}, {{1.0}, {3.0}}>>
+
+  // Correct. The block sizes for all axes are skipped, making the
+  // sub-channel type essentially a per-tensor type.
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{}, {{1.0}}>>
+  ```
+
+- The block size for a particular axis must be a positive integer no larger
+  than the dimension size of the tensor along that axis.
+
+  ```
+  // Incorrect. The block size for axis 0 is -1.
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:-1}, {{1.0, 2.0}}>>
+
+  // Incorrect. The block size for axis 0 is 8, which is greater than the
+  // dimension size of the tensor at axis 0 (which is 6).
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:8}, {{1.0, 2.0}}>>
+
+  // Correct. The block size for axis 0 is now 3.
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:3}, {{1.0}, {2.0}}>>
+  ```
+
+- shape(tensor) % blockSizes = 0, where blockSizes = [block size for
+  axis i, for i in [0, 1, ..., rank(tensor)-1]].
+
+  ```
+  // Incorrect. The block size for axis 0 is 4, the corresponding
+  // dimension size is 6, and 6 % 4 != 0.
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:4}, {{1.0, 2.0}}>>
+
+  // Correct. The block size for axis 0 is now 3, making 6 % 3 = 0.
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:3}, {{1.0}, {2.0}}>>
+  ```
+
+- shape(scales) = shape(zeroPoints) = shape(tensor) / blockSizes.
+
+  ```
+  // Incorrect. shape(tensor) = [6,2], blockSizes = [3,2], but
+  // shape(scales) is [1,2], which is not equal to [6,2]/[3,2].
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:3}, {{1.0, 2.0}}>>
+
+  // Correct. shape(tensor) = [6,2], blockSizes = [3,2], and
+  // shape(scales) equals [6,2]/[3,2].
+  %result = quant.qcast %input : tensor<6x2xf32> to
+            tensor<6x2x!quant.uniform<i8:f32:{0:3}, {{1.0}, {2.0}}>>
 ```
 
 ## Sub-channel quantization integrity
````
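The integrity rules above can be condensed into a small verifier sketch. This is a hedged Python illustration only (assumptions: `block_info` maps axis to block size; this is not the actual MLIR verifier in QuantOps.cpp):

```python
def verify_sub_channel(data_shape, block_info, scales_shape):
    """Check the sub-channel integrity rules on plain Python lists.

    block_info: dict mapping axis -> block size; unspecified axes default
    to the full dimension size (the degenerate, per-tensor-like block).
    """
    rank = len(data_shape)
    for axis, block in block_info.items():
        if not 0 <= axis < rank:
            return False  # axis must be valid for the tensor's rank
        if block <= 0 or data_shape[axis] % block != 0:
            return False  # block size positive; shape(tensor) % blockSizes = 0
    blocks = [block_info.get(i, data_shape[i]) for i in range(rank)]
    # shape(scales) = shape(zeroPoints) = shape(tensor) / blockSizes
    return list(scales_shape) == [d // b for d, b in zip(data_shape, blocks)]

assert verify_sub_channel([6, 2], {0: 3}, [2, 1])      # the "correct" example
assert not verify_sub_channel([6, 2], {0: 4}, [1, 1])  # 6 % 4 != 0
assert not verify_sub_channel([2, 2], {2: 1}, [1, 2])  # axis out of range
assert verify_sub_channel([6, 2], {}, [1, 1])          # per-tensor degenerate
```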

mlir/include/mlir/Dialect/Quant/IR/QuantDialectBytecode.td

Lines changed: 20 additions & 9 deletions

```diff
@@ -82,6 +82,21 @@ def UniformQuantizedPerAxisType: DialectType<(type
   }];
 }
 
+def UniformQuantizedSubChannelType
+    : DialectType<(type VarInt:$flags, Type:$storageType, Type:$expressedType,
+          SignedVarInt:$storageTypeMin, SignedVarInt:$storageTypeMax,
+          Array<SignedVarIntList>:$quantizedDimensions,
+          Array<SignedVarIntList>:$blockSizes, DenseElementsAttr:$scales,
+          DenseElementsAttr:$zeroPoints)> {
+  // Note: builder order differs from bytecode.
+  let cBuilder = [{
+    get<$_resultType>(context, flags, storageType, expressedType, scales,
+        zeroPoints, llvm::to_vector(llvm::map_range(quantizedDimensions,
+        [](int64_t dim) { return static_cast<int32_t>(dim); })), blockSizes,
+        storageTypeMin, storageTypeMax)
+  }];
+}
+
 def QuantileQuantizedType: DialectType<(type
   VarInt:$flags,
   Type:$storageType,
@@ -119,16 +134,12 @@ def QuantileQuantizedPerAxisType: DialectType<(type
 /// compatibility with older bytecode.
 
 def QuantDialectTypes : DialectTypes<"Quant"> {
-  let elems = [
-    ReservedOrDead,
-    AnyQuantizedType,
-    AnyQuantizedTypeWithExpressedType,
-    CalibratedQuantizedType,
-    UniformQuantizedType,
-    UniformQuantizedPerAxisType,
+  let elems = [ReservedOrDead, AnyQuantizedType,
+               AnyQuantizedTypeWithExpressedType, CalibratedQuantizedType,
+               UniformQuantizedType, UniformQuantizedPerAxisType,
     QuantileQuantizedType,
-    QuantileQuantizedPerAxisType
-  ];
+               QuantileQuantizedPerAxisType,
+               UniformQuantizedSubChannelType];
 }
 
 #endif // QUANT_BYTECODE
```

mlir/include/mlir/Dialect/Quant/IR/QuantTypes.h

Lines changed: 130 additions & 0 deletions

```diff
@@ -401,6 +401,136 @@ class UniformQuantizedPerAxisType
   }
 };
 
+/// Represents sub-channel (also known as blockwise) quantization.
+///
+/// Syntax synopsis:
+///   UniformQuantizedSubChannelType ::= '!quant.uniform' '<'
+///       storageType ('<' storageMin ':' storageMax '>')? ':'
+///       expressedType ':' BlockSizeInfo ',' ScaleZeroTensor '>'
+///   BlockSizeInfo ::= '{' '}' | '{' AxisBlock (',' AxisBlock)* '}'
+///   AxisBlock ::= AxisSpec ':' BlockSizeSpec
+///   ScaleZeroTensor ::= ScaleZeroDenseExp | ScaleZeroList
+///   ScaleZeroDenseExp ::= '{' ScaleZeroTensor (',' ScaleZeroTensor)* '}'
+///   ScaleZeroList ::= ScaleZero (',' ScaleZero)*
+///   ScaleZero ::= Scale (':' ZeroPoint)?
+///
+///   StorageType: 'i'|'u' NumBits
+///   ExpressedType: 'f16', 'f32', 'bf16', 'f64'
+///   AxisSpec: An integer value
+///   BlockSizeSpec: An integer value
+///   Scale: An attribute (usually a floating-point value)
+///   ZeroPoint: An attribute (usually an integer value)
+class UniformQuantizedSubChannelType
+    : public Type::TypeBase<UniformQuantizedSubChannelType, QuantizedType,
+                            detail::UniformQuantizedSubChannelTypeStorage> {
+public:
+  using Base::Base;
+  using Base::getChecked;
+
+  static constexpr StringLiteral name = "quant.uniform_sub_channel";
+
+  /// Gets an instance of the type with all parameters specified but not
+  /// checked.
+  static UniformQuantizedSubChannelType
+  get(unsigned flags, Type storageType, Type expressedType,
+      DenseElementsAttr scales, DenseElementsAttr zeroPoints,
+      ArrayRef<int32_t> quantizedDimensions, ArrayRef<int64_t> blockSizes,
+      int64_t storageTypeMin, int64_t storageTypeMax);
+
+  /// Gets an instance of the type with all specified parameters checked.
+  /// Returns a nullptr-convertible type on failure.
+  static UniformQuantizedSubChannelType
+  getChecked(function_ref<InFlightDiagnostic()> emitError, unsigned flags,
+             Type storageType, Type expressedType, DenseElementsAttr scales,
+             DenseElementsAttr zeroPoints,
+             ArrayRef<int32_t> quantizedDimensions,
+             ArrayRef<int64_t> blockSizes, int64_t storageTypeMin,
+             int64_t storageTypeMax);
+
+  /// Verifies construction invariants and issues errors/warnings.
+  static LogicalResult
+  verifyInvariants(function_ref<InFlightDiagnostic()> emitError, unsigned flags,
+                   Type storageType, Type expressedType,
+                   DenseElementsAttr scales, DenseElementsAttr zeroPoints,
+                   ArrayRef<int32_t> quantizedDimensions,
+                   ArrayRef<int64_t> blockSizes, int64_t storageTypeMin,
+                   int64_t storageTypeMax);
+
+  /// Gets the quantization scales. The scales are organized in a
+  /// multi-dimensional tensor. The size of each dimension in the scales tensor
+  /// is determined by the number of blocks along the corresponding dimension in
+  /// the quantized data tensor.
+  ///
+  /// For example, if the quantized data tensor has shape [X0, X1, ..., XR-1]
+  /// and the block sizes are [B0, B1, ..., BR-1], then the scales tensor will
+  /// have shape [X0/B0, X1/B1, ..., XR-1/BR-1].
+  ///
+  /// The scale value for a specific element in the quantized data tensor at
+  /// position [i0, i1, ..., iR-1] is determined by accessing the corresponding
+  /// element in the scales tensor at position [i0/B0, i1/B1, ..., iR-1/BR-1].
+  DenseElementsAttr getScales() const;
+
+  /// Gets the quantization zero-points. The zero-points are organized in a
+  /// multi-dimensional tensor. The size of each dimension in the zero-point
+  /// tensor is determined by the number of blocks along the corresponding
+  /// dimension in the quantized data tensor.
+  ///
+  /// For example, if the quantized data tensor has shape [X0, X1, ..., XR-1]
+  /// and the block sizes are [B0, B1, ..., BR-1], then the zero-point tensor
+  /// will have shape [X0/B0, X1/B1, ..., XR-1/BR-1].
+  ///
+  /// The zero-point value for a specific element in the quantized data tensor
+  /// at position [i0, i1, ..., iR-1] is determined by accessing the
+  /// corresponding element in the zero-point tensor at position [i0/B0, i1/B1,
+  /// ..., iR-1/BR-1].
+  DenseElementsAttr getZeroPoints() const;
+
+  /// Gets the quantized dimensions. Each element in the returned list
+  /// represents an axis of the quantized data tensor that has a specified block
+  /// size. The order of elements corresponds to the order of block sizes
+  /// returned by `getBlockSizes()`.
+  ///
+  /// That is, the data tensor is quantized along the `i`-th dimension in
+  /// the returned list using the `i`-th block size from `getBlockSizes()`.
+  ///
+  /// Note that the type expression does not have to specify the block size for
+  /// all axes in the data tensor. Any unspecified block size for an axis `i`
+  /// defaults to the tensor dimension size of that axis.
+  ///
+  /// For example, for a quantized type:
+  /// `tensor<8x4x2x!quant.uniform<i8:f32:{1:2, 0:8}, {{1.0, 2.0}, {3.0, 4.0}}>`
+  ///
+  /// `getQuantizedDimensions()` returns [1, 0].
+  /// `getBlockSizes()` returns [2, 8].
+  ///
+  /// This indicates that:
+  /// * Axis 1 (second dimension) is quantized with a block size of 2.
+  /// * Axis 0 (first dimension) is quantized with a block size of 8.
+  /// Since axis 2 is not specified, it implicitly has a block size equal to
+  /// the size of the third dimension (which is 2 in this case).
+  ArrayRef<int32_t> getQuantizedDimensions() const;
+
+  /// Gets the block sizes for the quantized dimensions. The `i`-th element in
+  /// the returned list corresponds to the block size for the `i`-th dimension
+  /// in the list returned by `getQuantizedDimensions()`.
+  ///
+  /// See `getQuantizedDimensions()` for more details and examples.
+  ArrayRef<int64_t> getBlockSizes() const;
+
+  /// Gets the block size information. This returns a list of pairs, where each
+  /// pair represents a quantized dimension and its corresponding block size.
+  ///
+  /// For example, for the type:
+  /// `tensor<8x4x!quant.uniform<i8:f32:{1:2, 0:8}, {{2.0, 3.0}}>`
+  ///
+  /// This method returns:
+  /// `[(1, 2), (0, 8)]`
+  ///
+  /// This list indicates that axis 1 has a block size of 2, and axis 0 has a
+  /// block size of 8.
+  const SmallVector<std::pair<int32_t, int64_t>> getBlockSizeInfo() const;
+};
+
 /// QuantileQuantizedType derives from UniformQuantizedType and adds to it a
 /// look up table array of quantile values. The type of the data in the look up
 /// table is determined by the quantileType member: supported quantileType types
```
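The scale-lookup convention documented for `getScales()` can be sketched in Python. This is an illustration of the indexing rule only, not the MLIR API; the nested-list layout and the scale values are assumptions:

```python
def scale_for(index, block_sizes, scales):
    """Scale for the data element at `index`: scales[i0//B0][i1//B1]..."""
    value = scales
    for i, b in zip(index, block_sizes):
        value = value[i // b]  # each axis index maps to its block index
    return value

# A 4x2 data tensor with per-axis block sizes [2, 2] has a 2x1 scales tensor.
scales = [[0.5], [4.0]]
assert scale_for((0, 1), [2, 2], scales) == 0.5  # rows 0-1 share one block
assert scale_for((3, 0), [2, 2], scales) == 4.0  # rows 2-3 share the other
```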

mlir/lib/Dialect/Quant/IR/QuantDialectBytecode.cpp

Lines changed: 2 additions & 0 deletions

```diff
@@ -13,6 +13,8 @@
 #include "mlir/Dialect/Quant/IR/QuantTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
```

mlir/lib/Dialect/Quant/IR/QuantOps.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -122,7 +122,7 @@ LogicalResult verifySubChannelQuantization(
   //
   // Therefore, we explicitly disallow the case where d = 0 to maintain
   // consistency and avoid these issues.
-  if (llvm::is_contained(tensorType.getShape(), 0)) {
+  if (llvm::find(tensorType.getShape(), 0) != tensorType.getShape().end()) {
     return op->emitError() << "tensor dimension size of zero is not allowed "
                               "with sub-channel quantization";
   }
@@ -192,7 +192,7 @@ LogicalResult verifyQuantizationOp(Operation *op, QuantizedType quantizedType,
 void QuantDialect::initialize() {
   addTypes<AnyQuantizedType, CalibratedQuantizedType, UniformQuantizedType,
            UniformQuantizedPerAxisType, QuantileQuantizedType,
-           QuantileQuantizedPerAxisType>();
+           QuantileQuantizedPerAxisType, UniformQuantizedSubChannelType>();
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/Quant/IR/QuantOps.cpp.inc"
```
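As a side note on the `d = 0` check in `verifySubChannelQuantization` above: zero-sized dimensions are rejected because divisibility becomes vacuous and the scales tensor collapses. A hedged arithmetic sketch (my illustration, not code from the patch):

```python
# With a zero-sized axis, any positive block size "divides" the dimension,
# and the corresponding scales dimension becomes empty, so the type would
# carry no usable scale data -- hence the explicit verifier error.
dim, block = 0, 3
assert dim % block == 0   # divisibility holds vacuously for every block size
assert dim // block == 0  # the scales tensor gets a zero-sized dimension too
```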
