Skip to content

Commit 4e3672a

Browse files
Fixing a mismatch in the 4x4 kernel use case
1 parent 189c863 commit 4e3672a

File tree

2 files changed

+58
-7
lines changed

2 files changed

+58
-7
lines changed

src/Dialect/ONNX/Transforms/Decompose.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1735,6 +1735,55 @@ Value decomposeIntoPhasedConvs(PatternRewriter &rewriter, Location loc,
17351735
ValueRange{conv2, conv4, conv3, conv1}, 1)
17361736
: rewriter.create<ONNXConcatOp>(loc, concatOutputType,
17371737
ValueRange{conv1, conv3, conv4, conv2}, 1);
1738+
} else if (kernelShape[0] == 4) {
1739+
Value conv1 = getActivationAppliedToConv(
1740+
addQDQNodesForActivationIfNeeded(rewriter.create<ONNXConvOp>(loc,
1741+
convOutputType, input, addDequantizeNodeIfNeeded(weightSlices[3]),
1742+
bias, mlir::StringAttr(), dilations, group,
1743+
convKernelShapeArrayAttr,
1744+
getPadsArrayAttr(kernelShape[0], 1, needWeightsPadding),
1745+
stridesArrayAttr)),
1746+
convOutputType);
1747+
Value conv2 = getActivationAppliedToConv(
1748+
addQDQNodesForActivationIfNeeded(rewriter.create<ONNXConvOp>(loc,
1749+
convOutputType, input, addDequantizeNodeIfNeeded(weightSlices[0]),
1750+
bias, mlir::StringAttr(), dilations, group,
1751+
convKernelShapeArrayAttr,
1752+
getPadsArrayAttr(kernelShape[0], 2, needWeightsPadding),
1753+
stridesArrayAttr)),
1754+
convOutputType);
1755+
Value conv3 = getActivationAppliedToConv(
1756+
addQDQNodesForActivationIfNeeded(rewriter.create<ONNXConvOp>(loc,
1757+
convOutputType, input, addDequantizeNodeIfNeeded(weightSlices[1]),
1758+
bias, mlir::StringAttr(), dilations, group,
1759+
convKernelShapeArrayAttr,
1760+
getPadsArrayAttr(kernelShape[0], 3, needWeightsPadding),
1761+
stridesArrayAttr)),
1762+
convOutputType);
1763+
Value conv4 = getActivationAppliedToConv(
1764+
addQDQNodesForActivationIfNeeded(rewriter.create<ONNXConvOp>(loc,
1765+
convOutputType, input, addDequantizeNodeIfNeeded(weightSlices[2]),
1766+
bias, mlir::StringAttr(), dilations, group,
1767+
convKernelShapeArrayAttr,
1768+
getPadsArrayAttr(kernelShape[0], 4, needWeightsPadding),
1769+
stridesArrayAttr)),
1770+
convOutputType);
1771+
// Four conv outputs are merged in channel dim
1772+
SmallVector<int64_t> outputShapeOfConcat = {
1773+
1, convOutputShape[1] * 4, convOutputShape[2], convOutputShape[3]};
1774+
auto concatOutputType =
1775+
RankedTensorType::get(outputShapeOfConcat, elementType);
1776+
// For the case where the ConvTranspose kernel is [4, 4] with pads [1, 1,
1777+
// 1, 1], the phased conv outputs are to be concatenated in the reverse
1778+
// order. This is observed by comparing the phased conv outputs with
1779+
// respect to the ConvTranspose output.
1780+
bool reverseConcatOrder = (needWeightsPadding || (kernelShape[0] == 4));
1781+
// The concat output will have 4 times the channels of a single conv.
1782+
conv = (reverseConcatOrder)
1783+
? rewriter.create<ONNXConcatOp>(loc, concatOutputType,
1784+
ValueRange{conv2, conv4, conv3, conv1}, 1)
1785+
: rewriter.create<ONNXConcatOp>(loc, concatOutputType,
1786+
ValueRange{conv1, conv3, conv4, conv2}, 1);
17381787
} else {
17391788
// Combining the 4 phased weights into single weight.
17401789
bool reverseOrder = (kernelShape[0] == 4);

test/mlir/onnx/onnx_decompose_convtranspose_phased_conv.mlir

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -502,13 +502,15 @@ func.func @test_convtrans_4phase_kernel_shape_44(%arg0: tensor<1x512x8x8xf32>, %
502502
// CHECK: %[[VAL_22:.*]] = "onnx.Slice"(%[[VAL_20]], %[[VAL_9]], %[[VAL_8]], %[[VAL_13]], %[[VAL_12]]) : (tensor<512x512x4x4xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<512x512x2x2xf32>
503503
// CHECK: %[[VAL_23:.*]] = "onnx.Slice"(%[[VAL_20]], %[[VAL_7]], %[[VAL_6]], %[[VAL_13]], %[[VAL_12]]) : (tensor<512x512x4x4xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<512x512x2x2xf32>
504504
// CHECK: %[[VAL_24:.*]] = "onnx.Slice"(%[[VAL_20]], %[[VAL_5]], %[[VAL_4]], %[[VAL_13]], %[[VAL_12]]) : (tensor<512x512x4x4xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<512x512x2x2xf32>
505-
// CHECK: %[[VAL_25:.*]] = "onnx.Concat"(%[[VAL_21]], %[[VAL_23]], %[[VAL_22]], %[[VAL_24]]) {axis = 0 : si64} : (tensor<512x512x2x2xf32>, tensor<512x512x2x2xf32>, tensor<512x512x2x2xf32>, tensor<512x512x2x2xf32>) -> tensor<2048x512x2x2xf32>
506-
// CHECK: %[[VAL_26:.*]] = "onnx.Concat"(%[[VAL_15]], %[[VAL_15]], %[[VAL_15]], %[[VAL_15]]) {axis = 0 : si64} : (tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<2048xf32>
507-
// CHECK: %[[VAL_27:.*]] = "onnx.Conv"(%[[VAL_0]], %[[VAL_25]], %[[VAL_26]]) {auto_pad = "NOTSET", dilations = [1, 1], group = 1 : si64, kernel_shape = [2, 2], pads = [0, 0, 1, 1], strides = [1, 1]} : (tensor<1x512x8x8xf32>, tensor<2048x512x2x2xf32>, tensor<2048xf32>) -> tensor<1x2048x8x8xf32>
508-
// CHECK: %[[VAL_28:.*]] = "onnx.Reshape"(%[[VAL_27]], %[[VAL_3]]) {allowzero = 0 : si64} : (tensor<1x2048x8x8xf32>, tensor<5xi64>) -> tensor<2x2x512x8x8xf32>
509-
// CHECK: %[[VAL_29:.*]] = "onnx.Transpose"(%[[VAL_28]]) {perm = [2, 3, 0, 4, 1]} : (tensor<2x2x512x8x8xf32>) -> tensor<512x8x2x8x2xf32>
510-
// CHECK: %[[VAL_30:.*]] = "onnx.Reshape"(%[[VAL_29]], %[[VAL_2]]) {allowzero = 0 : si64} : (tensor<512x8x2x8x2xf32>, tensor<4xi64>) -> tensor<1x512x16x16xf32>
511-
// CHECK: onnx.Return %[[VAL_30]] : tensor<1x512x16x16xf32>
505+
// CHECK: %[[VAL_25:.*]] = "onnx.Conv"(%[[VAL_0]], %[[VAL_24]], %[[VAL_15]]) {auto_pad = "NOTSET", dilations = [1, 1], group = 1 : si64, kernel_shape = [2, 2], pads = [0, 0, 1, 1], strides = [1, 1]} : (tensor<1x512x8x8xf32>, tensor<512x512x2x2xf32>, tensor<512xf32>) -> tensor<1x512x8x8xf32>
506+
// CHECK: %[[VAL_26:.*]] = "onnx.Conv"(%[[VAL_0]], %[[VAL_21]], %[[VAL_15]]) {auto_pad = "NOTSET", dilations = [1, 1], group = 1 : si64, kernel_shape = [2, 2], pads = [1, 1, 0, 0], strides = [1, 1]} : (tensor<1x512x8x8xf32>, tensor<512x512x2x2xf32>, tensor<512xf32>) -> tensor<1x512x8x8xf32>
507+
// CHECK: %[[VAL_27:.*]] = "onnx.Conv"(%[[VAL_0]], %[[VAL_22]], %[[VAL_15]]) {auto_pad = "NOTSET", dilations = [1, 1], group = 1 : si64, kernel_shape = [2, 2], pads = [0, 1, 1, 0], strides = [1, 1]} : (tensor<1x512x8x8xf32>, tensor<512x512x2x2xf32>, tensor<512xf32>) -> tensor<1x512x8x8xf32>
508+
// CHECK: %[[VAL_28:.*]] = "onnx.Conv"(%[[VAL_0]], %[[VAL_23]], %[[VAL_15]]) {auto_pad = "NOTSET", dilations = [1, 1], group = 1 : si64, kernel_shape = [2, 2], pads = [1, 0, 0, 1], strides = [1, 1]} : (tensor<1x512x8x8xf32>, tensor<512x512x2x2xf32>, tensor<512xf32>) -> tensor<1x512x8x8xf32>
509+
// CHECK: %[[VAL_29:.*]] = "onnx.Concat"(%[[VAL_26]], %[[VAL_28]], %[[VAL_27]], %[[VAL_25]]) {axis = 1 : si64} : (tensor<1x512x8x8xf32>, tensor<1x512x8x8xf32>, tensor<1x512x8x8xf32>, tensor<1x512x8x8xf32>) -> tensor<1x2048x8x8xf32>
510+
// CHECK: %[[VAL_30:.*]] = "onnx.Reshape"(%[[VAL_29]], %[[VAL_3]]) {allowzero = 0 : si64} : (tensor<1x2048x8x8xf32>, tensor<5xi64>) -> tensor<2x2x512x8x8xf32>
511+
// CHECK: %[[VAL_31:.*]] = "onnx.Transpose"(%[[VAL_30]]) {perm = [2, 3, 0, 4, 1]} : (tensor<2x2x512x8x8xf32>) -> tensor<512x8x2x8x2xf32>
512+
// CHECK: %[[VAL_32:.*]] = "onnx.Reshape"(%[[VAL_31]], %[[VAL_2]]) {allowzero = 0 : si64} : (tensor<512x8x2x8x2xf32>, tensor<4xi64>) -> tensor<1x512x16x16xf32>
513+
// CHECK: onnx.Return %[[VAL_32]] : tensor<1x512x16x16xf32>
512514
// CHECK: }
513515
}
514516

0 commit comments

Comments
 (0)