Skip to content

Commit 645b446

Browse files
Use scf::tileAndFuseConsumer in GPUFuseAndHoistParallelLoops (#22617)
This change simplifies the logic in `GPUFuseAndHoistParallelLoops` by using the `scf::tileAndFuseConsumer` method, which directly takes the consumer to fuse as an operand and finds the slices to fuse along. The previous implementation resulted in a subtle bug, where the operation that was expected to be fused and the operation actually fused were different. The new method disallows this by construction. There is still an issue of the pattern rewriter going into an infinite loop (or hitting the iteration limit). That is a problem because the tiling generates operations before failing. The tiling method is not intended to be called within pattern rewriters, but some outstanding changes to `TilingInterface` can also address this issue. Leaving this as an error for now. Fixes #22576 Signed-off-by: MaheshRavishankar <[email protected]>
1 parent 1f322ce commit 645b446

File tree

3 files changed

+41
-34
lines changed

3 files changed

+41
-34
lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUFuseAndHoistParallelLoops.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ void GPUFuseAndHoistParallelLoopsPass::runOnOperation() {
358358
patterns.add<FuseNestedLaneAndWarpForalls>(context);
359359
populateForallLoopHoistingPattern(patterns);
360360
if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
361+
funcOp->emitOpError("failed to apply fusion + hoisting patterns (set 1)");
361362
return signalPassFailure();
362363
}
363364
}
@@ -379,6 +380,7 @@ void GPUFuseAndHoistParallelLoopsPass::runOnOperation() {
379380
tensor::populateFoldTensorEmptyPatterns(patterns);
380381
scf::ForallOp::getCanonicalizationPatterns(patterns, context);
381382
if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
383+
funcOp->emitOpError("failed to apply fusion + hoisting patterns (set 2)");
382384
return signalPassFailure();
383385
}
384386
}
@@ -393,6 +395,7 @@ void GPUFuseAndHoistParallelLoopsPass::runOnOperation() {
393395
tensor::populateFoldTensorEmptyPatterns(patterns);
394396
scf::ForallOp::getCanonicalizationPatterns(patterns, context);
395397
if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
398+
funcOp->emitOpError("failed to apply fusion + hoisting patterns (set 3)");
396399
return signalPassFailure();
397400
}
398401
}

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_fuse_and_hoist_forall.mlir

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-codegen-gpu-fuse-and-hoist-parallel-loops))' --split-input-file | FileCheck %s
1+
// RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-codegen-gpu-fuse-and-hoist-parallel-loops))' --split-input-file --verify-diagnostics | FileCheck %s
22

33
#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
44

@@ -486,7 +486,7 @@ func.func @forall_hoist_unit_loop_with_fill(%3: tensor<1x128xf16>, %4: tensor<12
486486

487487
// -----
488488

489-
func.func @no_fuse_multi_use(%2: tensor<128x128xf16>, %3: tensor<128x128xf16>) -> tensor<128x128xf16> {
489+
func.func @fuse_multi_use(%2: tensor<128x128xf16>, %3: tensor<128x128xf16>) -> tensor<128x128xf16> {
490490
%c4 = arith.constant 4 : index
491491
%c128 = arith.constant 128 : index
492492
%c0 = arith.constant 0 : index
@@ -496,10 +496,9 @@ func.func @no_fuse_multi_use(%2: tensor<128x128xf16>, %3: tensor<128x128xf16>) -
496496
%extracted_slice_2 = tensor.extract_slice %arg7[%arg5, %arg6] [2, 2] [1, 1] : tensor<128x128xf16> to tensor<2x2xf16>
497497
%extracted_slice_3 = tensor.extract_slice %arg8[%arg6, %arg5] [2, 2] [1, 1] : tensor<128x128xf16> to tensor<2x2xf16>
498498
%16 = linalg.copy ins(%extracted_slice_1 : tensor<2x2xf16>) outs(%extracted_slice_2 : tensor<2x2xf16>) -> tensor<2x2xf16>
499-
%17 = linalg.transpose ins(%extracted_slice_1 : tensor<2x2xf16>) outs(%extracted_slice_3 : tensor<2x2xf16>) permutation = [1, 0]
500499
scf.forall.in_parallel {
501500
tensor.parallel_insert_slice %16 into %arg7[%arg5, %arg6] [2, 2] [1, 1] : tensor<2x2xf16> into tensor<128x128xf16>
502-
tensor.parallel_insert_slice %17 into %arg8[%arg6, %arg5] [2, 2] [1, 1] : tensor<2x2xf16> into tensor<128x128xf16>
501+
tensor.parallel_insert_slice %16 into %arg8[%arg5, %arg6] [2, 2] [1, 1] : tensor<2x2xf16> into tensor<128x128xf16>
503502
}
504503
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
505504
%add = linalg.add
@@ -508,14 +507,42 @@ func.func @no_fuse_multi_use(%2: tensor<128x128xf16>, %3: tensor<128x128xf16>) -
508507
return %add : tensor<128x128xf16>
509508
}
510509

511-
// CHECK-LABEL: func @no_fuse_multi_use
510+
// CHECK-LABEL: func @fuse_multi_use(
512511
// CHECK: scf.forall
513512
// CHECK: linalg.copy
514-
// CHECK: linalg.transpose
513+
// CHECK: linalg.add
515514
// CHECK: scf.forall.in_parallel
516-
// CHECK: linalg.add
517515
// CHECK: return
518516

517+
518+
// -----
519+
520+
// For now this test errors out because the pattern rewriter goes into an infinite loop. This happens because the consumer
521+
// fusion fails, but modifies the IR before failing. This will be fixed shortly upstream.
522+
523+
// expected-error @+1 {{failed to apply fusion + hoisting patterns (set 1)}}
524+
func.func @no_fuse_incompatible_multi_use(%2: tensor<128x128xf16>, %3: tensor<128x128xf16>) -> tensor<128x128xf16> {
525+
%c4 = arith.constant 4 : index
526+
%c128 = arith.constant 128 : index
527+
%c0 = arith.constant 0 : index
528+
%empty = tensor.empty() : tensor<128x128xf16>
529+
%10:2 = scf.forall (%arg5, %arg6) in (32, 32) shared_outs(%arg7 = %empty, %arg8 = %empty) -> (tensor<128x128xf16>, tensor<128x128xf16>) {
530+
%extracted_slice_1 = tensor.extract_slice %2[%arg5, %arg6] [2, 2] [1, 1] : tensor<128x128xf16> to tensor<2x2xf16>
531+
%extracted_slice_2 = tensor.extract_slice %arg7[%arg5, %arg6] [2, 2] [1, 1] : tensor<128x128xf16> to tensor<2x2xf16>
532+
%extracted_slice_3 = tensor.extract_slice %arg8[%arg6, %arg5] [2, 2] [1, 1] : tensor<128x128xf16> to tensor<2x2xf16>
533+
%16 = linalg.copy ins(%extracted_slice_1 : tensor<2x2xf16>) outs(%extracted_slice_2 : tensor<2x2xf16>) -> tensor<2x2xf16>
534+
%17 = linalg.transpose ins(%extracted_slice_1 : tensor<2x2xf16>) outs(%extracted_slice_3 : tensor<2x2xf16>) permutation = [1, 0]
535+
scf.forall.in_parallel {
536+
tensor.parallel_insert_slice %16 into %arg7[%arg5, %arg6] [2, 2] [1, 1] : tensor<2x2xf16> into tensor<128x128xf16>
537+
tensor.parallel_insert_slice %17 into %arg8[%arg6, %arg5] [2, 2] [1, 1] : tensor<2x2xf16> into tensor<128x128xf16>
538+
}
539+
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
540+
%add = linalg.add
541+
ins(%10#0, %10#1 : tensor<128x128xf16>, tensor<128x128xf16>)
542+
outs(%empty: tensor<128x128xf16>) -> tensor<128x128xf16>
543+
return %add : tensor<128x128xf16>
544+
}
545+
519546
// -----
520547

521548
#map = affine_map<(d0) -> (d0 * 64)>

compiler/src/iree/compiler/Codegen/Common/Transforms.cpp

Lines changed: 4 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ struct FuseTilableForallConsumers final
4848

4949
tensor::ParallelInsertSliceOp producerSlice;
5050
LoopLikeOpInterface sliceOwner;
51-
Value fusionOperand;
5251
for (auto operand : dpsOp.getDpsInputs()) {
5352
auto forallProducer = operand.getDefiningOp<scf::ForallOp>();
5453
if (!forallProducer) {
@@ -57,36 +56,15 @@ struct FuseTilableForallConsumers final
5756
if (forallProducer->getBlock() != tilableOp->getBlock()) {
5857
continue;
5958
}
60-
Value iterArg = forallProducer.getTiedBlockArgument(
61-
forallProducer.getTiedOpOperand(cast<OpResult>(operand)));
62-
63-
for (auto user : iterArg.getUsers()) {
64-
auto sliceOp = dyn_cast<tensor::ParallelInsertSliceOp>(user);
65-
if (sliceOp && sliceOp.getDest() == iterArg) {
66-
producerSlice = sliceOp;
67-
sliceOwner = forallProducer;
68-
fusionOperand = operand;
69-
break;
70-
}
71-
}
72-
if (producerSlice) {
73-
break;
74-
}
59+
sliceOwner = forallProducer;
60+
break;
7561
}
7662

77-
if (!producerSlice) {
63+
if (!sliceOwner) {
7864
return rewriter.notifyMatchFailure(tilableOp,
7965
"no scf.forall producer to fuse into");
8066
}
8167

82-
for (auto operand : tilableOp->getOperands()) {
83-
if (operand != fusionOperand && operand.getDefiningOp() == sliceOwner) {
84-
return rewriter.notifyMatchFailure(tilableOp,
85-
"unimplemented: Cannot fuse op with "
86-
"multiple uses of producer loop");
87-
}
88-
}
89-
9068
// The `tileAndFuseConsumerOfSlices` transform will fail if there are any
9169
// users of the loop that do not dominate the `tilableOp`, so we move the
9270
// `tilableOp` and any producers needed for dominance right after the loop.
@@ -116,8 +94,7 @@ struct FuseTilableForallConsumers final
11694
}
11795

11896
FailureOr<scf::SCFFuseConsumerOfSliceResult> fuseConsumerResults =
119-
scf::tileAndFuseConsumerOfSlices(rewriter, producerSlice.getOperation(),
120-
{sliceOwner});
97+
scf::tileAndFuseConsumer(rewriter, tilableOp, {sliceOwner});
12198
if (failed(fuseConsumerResults)) {
12299
return failure();
123100
}

0 commit comments

Comments
 (0)