
Commit e4a2f86

[ONNX] Add per-column quantization support for Onnx.QLinearMatMul op (#4080)
This commit extends the OnnxToTorch lowering for the Onnx.QLinearMatMul op by adding support for per-column (per-channel) quantization of the input argument `b`. Since the `QuantizedMatmulOp` in the downstream ("Linalg") pipeline does not support per-column quantization for `b`, this lowering instead performs the matmul over the dequantized inputs `a` and `b` and then quantizes the output.

Fixes nod-ai/SHARK-ModelDev#916.

Signed-off-by: Vivek Khandelwal [email protected]
1 parent 1ea12de commit e4a2f86
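To make the new per-column path concrete, below is a small, self-contained C++ sketch of the arithmetic that path amounts to: dequantize `a` with its per-tensor scale/zero-point, dequantize `b` with one scale/zero-point per output column, multiply in float, then requantize the product with the output's per-tensor parameters. This is not the torch-mlir lowering itself; all tensor values, shapes, and quantization parameters below are illustrative assumptions.

// Illustrative sketch only: per-tensor dequant of a, per-column dequant of b,
// float matmul, then per-tensor requantization of the result.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Made-up shapes and quantization parameters (assumptions, not from the commit).
  const int M = 1, K = 2, N = 3;
  std::vector<uint8_t> a = {12, 200};              // (M, K), uint8
  float aScale = 0.05f;                            // per-tensor scale for a
  int aZp = 10;                                    // per-tensor zero-point for a
  std::vector<int8_t> b = {3, -4, 7, 1, 9, -2};    // (K, N), int8, row-major
  std::vector<float> bScale = {0.1f, 0.2f, 0.05f}; // per-column: one scale per N
  std::vector<int8_t> bZp = {0, 1, -1};            // per-column zero-points
  float cScale = 0.5f;                             // per-tensor output scale
  int cZp = 128;                                   // per-tensor output zero-point

  std::vector<uint8_t> c(M * N);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        // a_dequant = (a - a_zero_point) * a_scale
        float aDeq = (static_cast<float>(a[m * K + k]) - aZp) * aScale;
        // b_dequant = (b - b_zero_point[n]) * b_scale[n], indexed by column n.
        float bDeq = (static_cast<float>(b[k * N + n]) - bZp[n]) * bScale[n];
        acc += aDeq * bDeq;
      }
      // Requantize the float accumulator into the uint8 output tensor.
      long q = std::lround(acc / cScale) + cZp;
      c[m * N + n] = static_cast<uint8_t>(std::clamp(q, 0L, 255L));
    }
  }
  for (uint8_t v : c)
    std::cout << static_cast<int>(v) << " ";
  std::cout << "\n";
}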

File tree

2 files changed: +157 −26 lines

lib/Conversion/TorchOnnxToTorch/DefaultDomainQtoZ.cpp

Lines changed: 121 additions & 26 deletions
@@ -556,6 +556,7 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
   patterns.onOp(
       "QLinearMatMul", 1,
       [](OpBinder binder, ConversionPatternRewriter &rewriter) {
+        Location loc = binder.getLoc();
         Torch::ValueTensorType resultType;
         llvm::SmallVector<Value> operands;
         if (binder.tensorOperands(operands, 8) ||
@@ -577,10 +578,10 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
             return false;
           return true;
         };
-        if (!check(aScale) || !check(aZp) || !check(bScale) || !check(bZp) ||
-            !check(cScale) || !check(cScale))
+        if (!check(aScale) || !check(aZp) || !check(cScale) || !check(cZp))
           return rewriter.notifyMatchFailure(
-              binder.op, "not supported for non per-tensor quantization");
+              binder.op, "input `a` and output not supported for non "
+                         "per-tensor quantization");

         Value emptyList = rewriter.create<Torch::PrimListConstructOp>(
             binder.getLoc(),
@@ -605,26 +606,117 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
         };

         aZp = extract(aZp);
-        bZp = extract(bZp);
         cZp = extract(cZp);
         aScale = extract(aScale);
-        bScale = extract(bScale);
         cScale = extract(cScale);

-        auto make = [&rewriter, &binder](Value v, Value scale,
-                                         Value zp) -> Value {
+        auto makePerTensor = [&rewriter, &binder](Value v, Value scale,
+                                                  Value zp) -> Value {
           auto ty = cast<Torch::ValueTensorType>(v.getType());
           auto newTy = getQTorchTypeFromTorchIntType(ty);
           return rewriter.create<Torch::Aten_MakePerTensorQuantizedTensorOp>(
               binder.getLoc(), newTy, v, scale, zp);
         };

-        a = make(a, aScale, aZp);
-        b = make(b, bScale, bZp);
+        // The onnx's QLinearMatMul op allows per-column (per-channel)
+        // quantization only for the "b" tensor.
+        bool isPerColumnQuantization = false;
+        auto bTy = dyn_cast<Torch::ValueTensorType>(b.getType());
+        auto bScaleTy = dyn_cast<Torch::ValueTensorType>(bScale.getType());
+        auto bZpTy = dyn_cast<Torch::ValueTensorType>(bZp.getType());
+        if (!bTy || !bScaleTy || !bZpTy || !bTy.hasSizes() ||
+            !bScaleTy.hasSizes() || !bZpTy.hasSizes())
+          return rewriter.notifyMatchFailure(
+              binder.op, "Expected b, b_scale, and b_zero_point "
+                         "arguments to have sizes");
+        ArrayRef<int64_t> bShape(bTy.getSizes());
+        SmallVector<int64_t> bScaleShape(bScaleTy.getSizes());
+        SmallVector<int64_t> bZpShape(bZpTy.getSizes());
+        if (bScaleShape.size() == 0 ||
+            llvm::all_of(bScaleShape, [](int64_t s) { return s == 1; })) {
+          bZp = extract(bZp);
+          bScale = extract(bScale);
+          b = makePerTensor(b, bScale, bZp);
+        } else if ((bScaleShape.size() == 1 ||
+                    bScaleShape.size() == bShape.size()) &&
+                   bScaleShape.back() != Torch::kUnknownSize &&
+                   bScaleShape.back() == bShape.back()) {
+          // Since the `QuantizedMatmulOp` in the downstream pipeline
+          // ("Linalg") does not support the per-column (per-channel)
+          // quantization for the arg `b`, hence for this particular case we
+          // perform the matmul over the dequantized inputs i.e., `a` and `b`
+          // instead of relying on the downstream pipeline to handle this. This
+          // code can be removed and made similar to the other paths in this
+          // lowering once the per-column (per-channel) quantization support is
+          // added in the downstream pipeline.
+          isPerColumnQuantization = true;
+
+          auto aTy = dyn_cast<Torch::ValueTensorType>(a.getType());
+          if (!aTy || !aTy.hasSizes())
+            return rewriter.notifyMatchFailure(
+                binder.op, "Expected input argument `a` to have sizes");
+
+          // Dequantizing the a
+          // a = a.to(dtype=torch.float32)
+          // a_dequant = (a - a_zero_point) * a_scale
+
+          // Converting the a tensor to float32 type.
+          Value none = rewriter.create<Torch::ConstantNoneOp>(loc);
+          Value cstFalse = rewriter.create<Torch::ConstantBoolOp>(loc, false);
+          Value float32Type = rewriter.create<Torch::ConstantIntOp>(
+              loc, rewriter.getI64IntegerAttr(/*float32Type*/ 6));
+          Type f32aType = rewriter.getType<Torch::ValueTensorType>(
+              aTy.getSizes(), rewriter.getF32Type());
+          a = rewriter.create<Torch::AtenToDtypeOp>(loc, f32aType, a,
+                                                    float32Type,
+                                                    /*non_blocking=*/cstFalse,
+                                                    /*copy=*/cstFalse,
+                                                    /*memory_format=*/none);
+
+          Value cstOne = rewriter.create<Torch::ConstantFloatOp>(
+              loc, rewriter.getF64FloatAttr(1.0));
+          a = rewriter.create<Torch::AtenSubScalarOp>(loc, f32aType, a, aZp,
+                                                      cstOne);
+          a = rewriter.create<Torch::AtenMulScalarOp>(loc, f32aType, a, aScale);
+
+          // Dequantizing the b
+          // Shapes of the inputs are as follows:
+          // b = (B, K, N) or (K, N)
+          // b_scale = (B, 1, N) or (1, N) or (N)
+          // b_zero_point = (B, 1, N) or (1, N) or (N)
+          //
+          // We compute the dequantized `b` as follows:
+          // b = b.to(dtype=torch.float32)
+          // b_dequant = (b - b_zero_point) * b_scale
+
+          // Converting the b tensor to float32 type.
+          Type f32bType = rewriter.getType<Torch::ValueTensorType>(
+              bShape, rewriter.getF32Type());
+          b = rewriter.create<Torch::AtenToDtypeOp>(loc, f32bType, b,
+                                                    float32Type,
+                                                    /*non_blocking=*/cstFalse,
+                                                    /*copy=*/cstFalse,
+                                                    /*memory_format=*/none);
+
+          b = rewriter.create<Torch::AtenSubTensorOp>(loc, f32bType, b, bZp,
+                                                      cstOne);
+          b = rewriter.create<Torch::AtenMulTensorOp>(loc, f32bType, b, bScale);
+        } else {
+          llvm_unreachable(
+              "Unidentified case for quantization for `b` argument of"
+              "Onnx.QLinearMatMul op");
+        }
+
+        if (!isPerColumnQuantization)
+          a = makePerTensor(a, aScale, aZp);
+
+        Type cDtype =
+            isPerColumnQuantization
+                ? cast<Type>(rewriter.getF32Type())
+                : cast<Type>(rewriter.getIntegerType(32, /*issigned=*/true));

         auto cTy = rewriter.getType<Torch::ValueTensorType>(
-            resultType.getOptionalSizes(),
-            rewriter.getIntegerType(32, /*issigned=*/true));
+            resultType.getOptionalSizes(), cDtype);

         Value c;
         if (cTy.getSizes().size() == 2) {
@@ -633,23 +725,26 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
           c = rewriter.create<Torch::AtenBmmOp>(binder.getLoc(), cTy, a, b);
         }

-        cTy = rewriter.getType<Torch::ValueTensorType>(
-            resultType.getOptionalSizes(),
-            rewriter.getType<Torch::QInt32Type>());
+        if (!isPerColumnQuantization) {
+          cTy = rewriter.getType<Torch::ValueTensorType>(
+              resultType.getOptionalSizes(),
+              rewriter.getType<Torch::QInt32Type>());

-        Value mmScale = rewriter.create<Torch::AtenMulFloatOp>(
-            binder.getLoc(), rewriter.getType<Torch::FloatType>(), aScale,
-            bScale);
-        Value mmZp = rewriter.create<Torch::ConstantIntOp>(
-            binder.getLoc(), rewriter.getType<Torch::IntType>(),
-            rewriter.getIntegerAttr(rewriter.getIntegerType(64), 0));
-        c = rewriter.create<Torch::Aten_MakePerTensorQuantizedTensorOp>(
-            binder.getLoc(), cTy, c, mmScale, mmZp);
-        cTy = rewriter.getType<Torch::ValueTensorType>(
-            resultType.getOptionalSizes(), rewriter.getF32Type());
+          Value mmScale = rewriter.create<Torch::AtenMulFloatOp>(
+              binder.getLoc(), rewriter.getType<Torch::FloatType>(), aScale,
+              bScale);
+          Value mmZp = rewriter.create<Torch::ConstantIntOp>(
+              binder.getLoc(), rewriter.getType<Torch::IntType>(),
+              rewriter.getIntegerAttr(rewriter.getIntegerType(64), 0));
+          c = rewriter.create<Torch::Aten_MakePerTensorQuantizedTensorOp>(
+              binder.getLoc(), cTy, c, mmScale, mmZp);
+          cTy = rewriter.getType<Torch::ValueTensorType>(
+              resultType.getOptionalSizes(), rewriter.getF32Type());
+
+          c = rewriter.create<Torch::AtenDequantizeSelfOp>(binder.getLoc(), cTy,
+                                                           c);
+        }

-        c = rewriter.create<Torch::AtenDequantizeSelfOp>(binder.getLoc(), cTy,
-                                                         c);
         cTy = dyn_cast<Torch::ValueTensorType>(
             getQTorchTypeFromTorchIntType(resultType));
         Value dtyVal = rewriter.create<Torch::ConstantIntOp>(

test/Conversion/TorchOnnxToTorch/simple_ops_q_to_z.mlir

Lines changed: 36 additions & 0 deletions
@@ -251,6 +251,42 @@ func.func @test_qlinearmatmul_3D(%arg0: !torch.vtensor<[2,2,4],ui8>, %arg1: !tor

 // -----

+// CHECK-LABEL: func.func @test_qlinearmatmul_per_channel_quantization(
+// CHECK-SAME: %[[A:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[1,4096],ui8>,
+// CHECK-SAME: %[[A_SCALE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],f32>,
+// CHECK-SAME: %[[A_ZERO_POINT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],ui8>,
+// CHECK-SAME: %[[B:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[4096,1024],si8>,
+// CHECK-SAME: %[[B_SCALE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[1024],f32>,
+// CHECK-SAME: %[[B_ZERO_POINT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[1024],si8>,
+// CHECK-SAME: %[[C_SCALE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],f32>,
+// CHECK-SAME: %[[C_ZERO_POINT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,1024],ui8>
+func.func @test_qlinearmatmul_per_channel_quantization(%arg0: !torch.vtensor<[1,4096],ui8>, %arg1: !torch.vtensor<[],f32>, %arg2: !torch.vtensor<[],ui8>, %arg3: !torch.vtensor<[4096,1024],si8>, %arg4: !torch.vtensor<[1024],f32>, %arg5: !torch.vtensor<[1024],si8>, %arg6: !torch.vtensor<[],f32>, %arg7: !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,1024],ui8> attributes {torch.onnx_meta.ir_version = 3 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.opset_versions = {ai.onnx.contrib = 1000 : si64, ai.onnx.ml = 3 : si64, ai.onnx.preview.training = 1 : si64, ai.onnx.training = 1 : si64, com.microsoft = 1 : si64, com.microsoft.experimental = 1 : si64, com.microsoft.nchwc = 1 : si64, com.ms.internal.nhwc = 19 : si64, org.pytorch.aten = 1 : si64}, torch.onnx_meta.producer_name = "onnx.quantize", torch.onnx_meta.producer_version = "0.1.0"} {
+  %0 = torch.operator "onnx.QLinearMatMul"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (!torch.vtensor<[1,4096],ui8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>, !torch.vtensor<[4096,1024],si8>, !torch.vtensor<[1024],f32>, !torch.vtensor<[1024],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,1024],ui8>
+  // CHECK: %[[EMPTY:.+]] = torch.prim.ListConstruct : () -> !torch.list<int>
+  // CHECK: %[[AZP:.+]] = torch.aten.item %[[A_ZERO_POINT]] : !torch.vtensor<[],ui8> -> !torch.int
+  // CHECK: %[[CZP:.+]] = torch.aten.item %[[C_ZERO_POINT]] : !torch.vtensor<[],ui8> -> !torch.int
+  // CHECK: %[[ASCALE:.+]] = torch.aten.item %[[A_SCALE]] : !torch.vtensor<[],f32> -> !torch.float
+  // CHECK: %[[CSCALE:.+]] = torch.aten.item %[[C_SCALE]] : !torch.vtensor<[],f32> -> !torch.float
+  // CHECK: %[[NONE:.*]] = torch.constant.none
+  // CHECK: %[[FALSE:.*]] = torch.constant.bool false
+  // CHECK: %[[F32DTYPE:.*]] = torch.constant.int 6
+  // CHECK: %[[A_F32:.*]] = torch.aten.to.dtype %[[A]], %[[F32DTYPE]], %[[FALSE]], %[[FALSE]], %[[NONE]] : !torch.vtensor<[1,4096],ui8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[1,4096],f32>
+  // CHECK: %[[ALPHA:.*]] = torch.constant.float 1.000000e+00
+  // CHECK: %[[A_MINUS_ZP:.*]] = torch.aten.sub.Scalar %[[A_F32]], %[[AZP]], %[[ALPHA]] : !torch.vtensor<[1,4096],f32>, !torch.int, !torch.float -> !torch.vtensor<[1,4096],f32>
+  // CHECK: %[[LHS:.*]] = torch.aten.mul.Scalar %[[A_MINUS_ZP]], %[[ASCALE]] : !torch.vtensor<[1,4096],f32>, !torch.float -> !torch.vtensor<[1,4096],f32>
+  // CHECK: %[[B_F32:.*]] = torch.aten.to.dtype %[[B]], %[[F32DTYPE]], %[[FALSE]], %[[FALSE]], %[[NONE]] : !torch.vtensor<[4096,1024],si8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,1024],f32>
+  // CHECK: %[[B_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[B_F32]], %[[B_ZERO_POINT]], %[[ALPHA]] : !torch.vtensor<[4096,1024],f32>, !torch.vtensor<[1024],si8>, !torch.float -> !torch.vtensor<[4096,1024],f32>
+  // CHECK: %[[RHS:.*]] = torch.aten.mul.Tensor %[[B_MINUS_ZP]], %[[B_SCALE]] : !torch.vtensor<[4096,1024],f32>, !torch.vtensor<[1024],f32> -> !torch.vtensor<[4096,1024],f32>
+  // CHECK: %[[MM:.*]] = torch.aten.mm %[[LHS]], %[[RHS]] : !torch.vtensor<[1,4096],f32>, !torch.vtensor<[4096,1024],f32> -> !torch.vtensor<[1,1024],f32>
+  // CHECK: %[[DTYPE:.*]] = torch.constant.int 13
+  // CHECK: %[[QO:.*]] = torch.aten.quantize_per_tensor %[[MM]], %[[CSCALE]], %[[CZP]], %[[DTYPE]] : !torch.vtensor<[1,1024],f32>, !torch.float, !torch.int, !torch.int -> !torch.vtensor<[1,1024],!torch.quint8>
+  // CHECK: %[[OUT:.*]] = torch.aten.int_repr %[[QO]] : !torch.vtensor<[1,1024],!torch.quint8> -> !torch.vtensor<[1,1024],ui8>
+  // CHECK: return %[[OUT]] : !torch.vtensor<[1,1024],ui8>
+  return %0 : !torch.vtensor<[1,1024],ui8>
+}
+
+// -----
+
 // CHECK-LABEL: func.func @test_reciprocal
 func.func @test_reciprocal(%arg0: !torch.vtensor<[3,4,5],f32>) -> !torch.vtensor<[3,4,5],f32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 13 : si64, torch.onnx_meta.producer_name = "backend-test", torch.onnx_meta.producer_version = ""} {
   // CHECK: torch.aten.reciprocal %arg0 : !torch.vtensor<[3,4,5],f32> -> !torch.vtensor<[3,4,5],f32>
