Introduce getExtendZp for both inputZp and outputZp

ShivaChen · ShivaChen · commit d9bcd1ea4b93 · 2025-09-24T05:22:42.000+01:00
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -1484,49 +1484,25 @@ static void setupLinalgGenericOpInputAndIndexingMap(
 }
 
 // Return the extended Zp to be used in subsequent arithmetic operations.
-static Value getExtendInputZp(OpBuilder &builder, Type valueTy,
-                              FailureOr<int64_t> maybeZp, Location loc,
-                              ValueRange blockArgs, int64_t iZpArg) {
+static Value getExtendZp(OpBuilder &builder, Type valueTy,
+                         FailureOr<int64_t> maybeZp, Location loc,
+                         ValueRange blockArgs, int64_t zpArg,
+                         bool isOutputZp = false) {
   Value result;
+  const int32_t bitwidth = valueTy.getIntOrFloatBitWidth();
+  const uint32_t attrBitwidth =
+      isOutputZp ? 32 : (bitwidth > 32 ? bitwidth : 32);
+  auto extendType = builder.getIntegerType(attrBitwidth);
   // The Zp value can be either constant or non-constant, depending on
   // whether dynamic extension is enabled.
   // If 'maybeZp' fails, it indicates that Zp is non-constant and will
   // be passed as an input to linalg::GenericOp.
   if (failed(maybeZp)) {
-    result = blockArgs[iZpArg];
+    result = blockArgs[zpArg];
     auto zpTy = result.getType();
-    if (zpTy.getIntOrFloatBitWidth() < 32) {
-      if (zpTy.isUnsignedInteger()) {
-        return builder.create<arith::ExtUIOp>(loc, builder.getI32Type(),
-                                              result);
-      } else {
-        return builder.create<arith::ExtSIOp>(loc, builder.getI32Type(),
-                                              result);
-      }
-    }
-  } else {
-    const int32_t bitwidth = valueTy.getIntOrFloatBitWidth();
-    // Extend zeropoint for sub-32bits widths.
-    const int32_t attrBitwidth = bitwidth > 32 ? bitwidth : 32;
-    return builder.create<arith::ConstantOp>(
-        loc, IntegerAttr::get(builder.getIntegerType(attrBitwidth), *maybeZp));
-  }
-  return result;
-}
-
-// Return the i32 outputZp to be used in subsequent arithmetic operations.
-static Value getI32OutputZp(OpBuilder &builder, Type valueTy,
-                            FailureOr<int64_t> maybeZp, Location loc,
-                            ValueRange blockArgs, int64_t oZpArg) {
-  Value result;
-  // The Zp value can be either constant or non-constant, depending on
-  // whether dynamic extension is enabled.
-  // If 'maybeZp' fails, it indicates that Zp is non-constant and will
-  // be passed as an input to linalg::GenericOp.
-  if (failed(maybeZp)) {
-    result = blockArgs[oZpArg];
-    auto zpTy = result.getType();
-    if (zpTy.getIntOrFloatBitWidth() < 32) {
+    if (zpTy.getIntOrFloatBitWidth() < attrBitwidth) {
+      // ExtUIOp requires a signless operand, so an UnrealizedConversionCastOp
+      // first casts the unsigned input to a signless type.
       if (zpTy.isUnsignedInteger()) {
         result =
             UnrealizedConversionCastOp::create(
@@ -1535,16 +1511,14 @@ static Value getI32OutputZp(OpBuilder &builder, Type valueTy,
                 .getResult(0);
       }
       if (zpTy.isUnsignedInteger()) {
-        return builder.create<arith::ExtUIOp>(loc, builder.getI32Type(),
-                                              result);
+        return builder.create<arith::ExtUIOp>(loc, extendType, result);
       } else {
-        return builder.create<arith::ExtSIOp>(loc, builder.getI32Type(),
-                                              result);
+        return builder.create<arith::ExtSIOp>(loc, extendType, result);
       }
     }
   } else {
     return builder.create<arith::ConstantOp>(
-        loc, IntegerAttr::get(builder.getIntegerType(32), *maybeZp));
+        loc, IntegerAttr::get(extendType, *maybeZp));
   }
   return result;
 }
@@ -1687,12 +1661,12 @@ class RescaleConverter : public OpRewritePattern<tosa::RescaleOp> {
           Type valueTy = value.getType();
 
           FailureOr<int64_t> maybeIZp = op.getInputZeroPoint();
-          auto inputZp = getExtendInputZp(nestedBuilder, valueTy, maybeIZp,
-                                          nestedLoc, blockArgs, iZpArg);
+          auto inputZp = getExtendZp(nestedBuilder, valueTy, maybeIZp,
+                                     nestedLoc, blockArgs, iZpArg);
 
           FailureOr<int64_t> maybeOZp = op.getOutputZeroPoint();
-          auto outputZp = getI32OutputZp(nestedBuilder, valueTy, maybeOZp,
-                                         nestedLoc, blockArgs, oZpArg);
+          auto outputZp = getExtendZp(nestedBuilder, valueTy, maybeOZp,
+                                      nestedLoc, blockArgs, oZpArg, true);
 
           IntegerType outIntType =
               cast<IntegerType>(blockArgs.back().getType());
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -1596,19 +1596,20 @@ func.func @rescale_no_const_per_channel(%arg0 : tensor<2xi8>, %arg1 : tensor<2xi
 
 // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0)>
 // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> ()>
-// CHECK-LABEL: @rescale_no_const_per_channel_output_zp_ui8
+// CHECK-LABEL: @rescale_no_const_per_channel_input_output_zp_ui8
 // CHECK-SAME: ([[ARG0:%[0-9a-zA-Z_]*]]
 // CHECK-SAME:  [[ARG1:%[0-9a-zA-Z_]*]]
 // CHECK-SAME:  [[ARG2:%[0-9a-zA-Z_]*]]
-func.func @rescale_no_const_per_channel_output_zp_ui8(%arg0 : tensor<2xi8>, %arg1 : tensor<2xi32>, %arg2 : tensor<2xi8>, %input_zp : tensor<1xi8>, %output_zp : tensor<1xui8>) -> (tensor<2xui8>) {
-  // CHECK: [[INPUT_ZP:%.+]] = tensor.collapse_shape %arg3 [] : tensor<1xi8> into tensor<i8>
+func.func @rescale_no_const_per_channel_input_output_zp_ui8(%arg0 : tensor<2xi8>, %arg1 : tensor<2xi32>, %arg2 : tensor<2xi8>, %input_zp : tensor<1xui8>, %output_zp : tensor<1xui8>) -> (tensor<2xui8>) {
+  // CHECK: [[INPUT_ZP:%.+]] = tensor.collapse_shape %arg3 [] : tensor<1xui8> into tensor<ui8>
   // CHECK: [[OUTPUT_ZP:%.+]] = tensor.collapse_shape %arg4 [] : tensor<1xui8> into tensor<ui8>
   // CHECK: [[INIT:%.+]] = tensor.empty() : tensor<2xui8>
-  // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]], #[[$MAP1]], #[[$MAP1]], #[[$MAP0]]], iterator_types = ["parallel"]} ins([[ARG0]], [[ARG1]], [[ARG2]], [[INPUT_ZP]], [[OUTPUT_ZP]] : tensor<2xi8>, tensor<2xi32>, tensor<2xi8>, tensor<i8>, tensor<ui8>) outs([[INIT]] : tensor<2xui8>) {
-  // CHECK:   ^bb0([[ARG0:%.*]]: i8, [[ARG1:%.*]]: i32, [[ARG2:%.*]]: i8, [[ARG3:%.*]]: i8, [[ARG4:%.*]]: ui8, [[OUT:%.*]]: ui8):
-  // CHECK:    [[INPUT_ZP_I32:%.+]] = arith.extsi [[ARG3]] : i8 to i32
-  // CHECK:    [[INPUT_ZP_I8:%.+]]  = builtin.unrealized_conversion_cast [[ARG4]] : ui8 to i8
-  // CHECK:    [[OUTPUT_ZP_I32:%.+]] = arith.extui [[INPUT_ZP_I8]] : i8 to i32
+  // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]], #[[$MAP1]], #[[$MAP1]], #[[$MAP0]]], iterator_types = ["parallel"]} ins([[ARG0]], [[ARG1]], [[ARG2]], [[INPUT_ZP]], [[OUTPUT_ZP]] : tensor<2xi8>, tensor<2xi32>, tensor<2xi8>, tensor<ui8>, tensor<ui8>) outs([[INIT]] : tensor<2xui8>) {
+  // CHECK:   ^bb0([[ARG0:%.*]]: i8, [[ARG1:%.*]]: i32, [[ARG2:%.*]]: i8, [[ARG3:%.*]]: ui8, [[ARG4:%.*]]: ui8, [[OUT:%.*]]: ui8):
+  // CHECK:    [[INPUT_ZP_I8:%.+]]  = builtin.unrealized_conversion_cast [[ARG3]] : ui8 to i8
+  // CHECK:    [[INPUT_ZP_I32:%.+]] = arith.extui [[INPUT_ZP_I8]] : i8 to i32
+  // CHECK:    [[OUTPUT_ZP_I8:%.+]]  = builtin.unrealized_conversion_cast [[ARG4]] : ui8 to i8
+  // CHECK:    [[OUTPUT_ZP_I32:%.+]] = arith.extui [[OUTPUT_ZP_I8]] : i8 to i32
   // CHECK:    [[ARG0_I32:%.+]] = arith.extsi [[ARG0]] : i8 to i32
   // CHECK:    [[TMP1:%.+]] = arith.subi [[ARG0_I32]], [[INPUT_ZP_I32]] : i32
   // CHECK:    [[TMP2:%.+]] = tosa.apply_scale [[TMP1]], [[ARG1]], [[ARG2]] {rounding_mode = DOUBLE_ROUND} : (i32, i32, i8) -> i32
@@ -1617,7 +1618,7 @@ func.func @rescale_no_const_per_channel_output_zp_ui8(%arg0 : tensor<2xi8>, %arg
   // CHECK:    %c255_i32 = arith.constant 255 : i32
   // CHECK:    [[MAX:%.+]] = arith.maxsi %c0_i32, [[TMP3]] : i32
   // CHECK:    [[MIN:%.+]] = arith.minsi %c255_i32, [[MAX]] : i32
-    %0 = tosa.rescale %arg0, %arg1, %arg2, %input_zp, %output_zp {scale32 = true, rounding_mode = DOUBLE_ROUND, per_channel = true, input_unsigned = false, output_unsigned = true} : (tensor<2xi8>, tensor<2xi32>, tensor<2xi8>, tensor<1xi8>, tensor<1xui8>) -> tensor<2xui8>
+    %0 = tosa.rescale %arg0, %arg1, %arg2, %input_zp, %output_zp {scale32 = true, rounding_mode = DOUBLE_ROUND, per_channel = true, input_unsigned = false, output_unsigned = true} : (tensor<2xi8>, tensor<2xi32>, tensor<2xi8>, tensor<1xui8>, tensor<1xui8>) -> tensor<2xui8>
   return %0 : tensor<2xui8>
 }