 
 #include <numeric>
 
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -75,7 +76,6 @@ ValueRange emitSplitByChannels(PatternRewriter &rewriter, Location loc,
     splitShape[axis] = size;
     resultTypes.push_back(RankedTensorType::get(splitShape, elementType));
   }
-  rewriter.setInsertionPointAfter(input.getDefiningOp());
   // Perform Split Operation
   ValueRange results =
       create.onnx.split(ArrayRef(resultTypes), input, splitConstant, axis);
@@ -1057,6 +1057,10 @@ struct CombineParallelConv2DPattern : public OpRewritePattern<ONNXConvOp> {
       return rewriter.notifyMatchFailure(
           convOp1, "input must be a ranked tensor with static shape");
 
+    if (!cast<ShapedType>(convOp1.getType()).hasStaticShape())
+      return rewriter.notifyMatchFailure(
+          convOp1, "output type must be a ranked tensor with static shape");
+
     // Collect all ONNXConvOps using this input.
     SmallVector<ONNXConvOp> candidateConvs;
     for (auto user : input.getUsers()) {
@@ -1084,6 +1088,55 @@ struct CombineParallelConv2DPattern : public OpRewritePattern<ONNXConvOp> {
 
     SmallVector<ONNXConvOp> parallelConvs = candidateConvs;
 
+    SmallVector<Value> weightValues;
+    int64_t totalOutputChannels = 0;
+    for (auto conv : parallelConvs) {
+      auto weightType = mlir::cast<ShapedType>(conv.getW().getType());
+      if (!weightType.hasStaticShape())
+        return rewriter.notifyMatchFailure(
+            conv, "weight must be a ranked tensor with static shape");
+      if (!cast<ShapedType>(conv.getType()).hasStaticShape())
+        return rewriter.notifyMatchFailure(
+            conv, "output type must be a ranked tensor with static shape");
+      weightValues.push_back(conv.getW());
+      totalOutputChannels += weightType.getShape()[0];
+    }
+
+    auto *latestConv =
+        llvm::max_element(parallelConvs, [](ONNXConvOp a, ONNXConvOp b) {
+          return a->isBeforeInBlock(b.getOperation());
+        });
+
+    const auto checkIfOtherConvsReachable = [&](ONNXConvOp conv) {
+      SmallVector<Operation *> worklist;
+      DenseSet<Operation *> visited;
+      worklist.push_back(conv.getOperation());
+      while (!worklist.empty()) {
+        Operation *current = worklist.back();
+        worklist.pop_back();
+
+        for (auto *user : current->getUsers()) {
+          if (auto otherConv = dyn_cast<ONNXConvOp>(user)) {
+            if (llvm::is_contained(parallelConvs, otherConv)) {
+              // Found another conv that is part of the parallel convs.
+              return true;
+            }
+          }
+          if (visited.insert(user).second &&
+              user->isBeforeInBlock(*latestConv)) {
+            worklist.push_back(user);
+          }
+        }
+      }
+      return false;
+    };
+    // Ensure all convolutions are really parallel: none of them may feed,
+    // directly or transitively, the input of another convolution.
+    if (llvm::any_of(parallelConvs, checkIfOtherConvsReachable)) {
+      return rewriter.notifyMatchFailure(
+          convOp1, "conv ops are not parallel (reachable from each other)");
+    }
+
     bool allHaveBias = !mlir::isa<NoneType>(parallelConvs[0].getB().getType());
 
     Location loc = convOp1.getLoc();
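The check added above is a plain worklist walk over def-use chains, bounded by the block position of the latest candidate conv: a candidate is rejected if any other candidate is reachable downstream of it. A rough, MLIR-free sketch of the same idea, using a hypothetical `isAnyOtherTargetReachable` helper over an integer adjacency list instead of `Operation::getUsers()`:

```cpp
#include <cstdio>
#include <set>
#include <vector>

// Adjacency list: users[n] holds the nodes that consume node n's result.
using Graph = std::vector<std::vector<int>>;

// Returns true if any node in `targets` other than `start` is reachable from
// `start` by following user edges (a simplified stand-in for the worklist
// walk over op users in the pattern above).
static bool isAnyOtherTargetReachable(
    const Graph &users, int start, const std::set<int> &targets) {
  std::vector<int> worklist{start};
  std::set<int> visited;
  while (!worklist.empty()) {
    int current = worklist.back();
    worklist.pop_back();
    for (int user : users[current]) {
      if (user != start && targets.count(user))
        return true; // another candidate consumes this one's result
      if (visited.insert(user).second)
        worklist.push_back(user);
    }
  }
  return false;
}

int main() {
  // Node 0 is the shared input, nodes 1 and 2 are candidate "convs",
  // node 3 consumes 1 and feeds 2, so 1 and 2 are not truly parallel.
  Graph users = {{1, 2}, {3}, {}, {2}};
  std::set<int> candidates = {1, 2};
  std::printf("from conv 1: %s\n",
      isAnyOtherTargetReachable(users, 1, candidates) ? "reachable" : "not");
  std::printf("from conv 2: %s\n",
      isAnyOtherTargetReachable(users, 2, candidates) ? "reachable" : "not");
  return 0;
}
```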
@@ -1097,14 +1150,6 @@ struct CombineParallelConv2DPattern : public OpRewritePattern<ONNXConvOp> {
 
     int64_t concatAxis = 1;
 
-    SmallVector<Value> weightValues;
-    int64_t totalOutputChannels = 0;
-    for (auto conv : parallelConvs) {
-      auto weightType = mlir::cast<ShapedType>(conv.getW().getType());
-      weightValues.push_back(conv.getW());
-      totalOutputChannels += weightType.getShape()[0];
-    }
-
     auto firstWeightType =
         mlir::cast<ShapedType>(parallelConvs[0].getW().getType());
     SmallVector<int64_t> newWeightShape(
@@ -1137,6 +1182,8 @@ struct CombineParallelConv2DPattern : public OpRewritePattern<ONNXConvOp> {
     newOutputShape[concatAxis] = totalOutputChannels;
     auto newOutputType = RankedTensorType::get(newOutputShape, elementType);
 
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPointAfter(*latestConv);
     auto newConv =
         rewriter.create<ONNXConvOp>(loc, newOutputType, input, newWeight,
             newBias, convOp1.getAutoPadAttr(), convOp1.getDilationsAttr(),
@@ -1171,8 +1218,7 @@ struct CombineParallelConv2DPattern : public OpRewritePattern<ONNXConvOp> {
 
     if (allOutputsUsedInCommonConcat && commonConcatOp &&
         commonConcatOp.getAxis() == 1) {
-      commonConcatOp.getResult().replaceAllUsesWith(newConv.getResult());
-      rewriter.eraseOp(commonConcatOp);
+      rewriter.replaceOp(commonConcatOp, newConv);
     } else {
       SmallVector<int64_t> splitSizesVec;
       for (auto conv : parallelConvs) {
@@ -1181,15 +1227,15 @@ struct CombineParallelConv2DPattern : public OpRewritePattern<ONNXConvOp> {
         splitSizesVec.push_back(channels);
       }
 
-      rewriter.setInsertionPointAfter(newConv);
       ValueRange splitResults = onnx_mlir::emitSplitByChannels(
           rewriter, loc, newConv.getResult(), splitSizesVec, concatAxis);
-
       for (size_t i = 0; i < parallelConvs.size(); ++i) {
-        parallelConvs[i].getResult().replaceAllUsesWith(splitResults[i]);
+        rewriter.replaceAllOpUsesWith(parallelConvs[i], splitResults[i]);
       }
+      // Sort the block topologically, as operations after the split may
+      // otherwise end up in the wrong place.
+      mlir::sortTopologically(newConv->getBlock());
     }
-
     for (auto conv : parallelConvs) {
       rewriter.eraseOp(conv);
     }
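`mlir::sortTopologically` re-orders the ops in the block so that every value is defined before its uses; without it, a consumer of one of the original conv results could sit above the newly inserted split ops after the replacement. As a rough, MLIR-free sketch of what a topological re-ordering computes (hypothetical integer-DAG encoding, Kahn's algorithm rather than MLIR's own implementation), consider:

```cpp
#include <cstdio>
#include <queue>
#include <vector>

int main() {
  // dependents[d] lists nodes that depend on d and must therefore come after
  // it. Node 3 (the "split") feeds node 1 (an old consumer placed earlier in
  // the block), so a correct order must move 3 before 1.
  std::vector<std::vector<int>> dependents = {{3}, {}, {}, {1, 2}};
  std::vector<int> indegree(dependents.size(), 0);
  for (const auto &users : dependents)
    for (int u : users)
      ++indegree[u];

  std::queue<int> ready;
  for (int n = 0; n < (int)dependents.size(); ++n)
    if (indegree[n] == 0)
      ready.push(n);

  std::printf("topological order:");
  while (!ready.empty()) {
    int n = ready.front();
    ready.pop();
    std::printf(" %d", n);
    for (int u : dependents[n])
      if (--indegree[u] == 0)
        ready.push(u);
  }
  std::printf("\n"); // prints: topological order: 0 3 1 2
  return 0;
}
```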
@@ -1273,8 +1319,7 @@ void onnx_mlir::getRecomposeONNXToONNXPatterns(
   patterns.insert<RecomposeDepthToSpaceDCR>(context);
   // AMD Disabled as downstream has no special support for it
   // patterns.insert<RecomposeQLinearMatMulFromQuantizeLinearPattern>(context);
-  // AMD Temporary disabled as this pattern is buggy.
-  // patterns.insert<CombineParallelConv2DPattern>(context);
+  patterns.insert<CombineParallelConv2DPattern>(context);
 }
 
 /*!