
Commit cd29f38

[CPU] Use scf.forall for TileRootAndFuseProducerConsumer by default. (iree-org#21260)
The revision drops the option and switches to scf.forall by default when tiling and fusing the parallel dimensions. To finish the migration, it updates the LinalgExt pipeline and adds the ForallToFor pass before vectorization. Signed-off-by: hanhanW <[email protected]>
1 parent e0b184c commit cd29f38
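For readers less familiar with the two loop forms the commit message refers to: below is a minimal hand-written sketch (not taken from this patch; the op, shapes, and tile sizes are made up) of the structure that parallel tile-and-fuse now emits by default. The results of each tile are committed through scf.forall.in_parallel and tensor.parallel_insert_slice rather than through scf.for plus scf.yield.

// Illustrative only: a 64x64 fill tiled by 16x16 into an scf.forall.
func.func @tiled_fill_sketch(%init: tensor<64x64xf32>) -> tensor<64x64xf32> {
  %cst = arith.constant 0.0 : f32
  // Parallel tiling: one forall iteration per 16x16 tile; %init is the shared output.
  %0 = scf.forall (%i, %j) = (0, 0) to (64, 64) step (16, 16)
      shared_outs(%out = %init) -> (tensor<64x64xf32>) {
    %slice = tensor.extract_slice %out[%i, %j] [16, 16] [1, 1]
        : tensor<64x64xf32> to tensor<16x16xf32>
    %fill = linalg.fill ins(%cst : f32) outs(%slice : tensor<16x16xf32>) -> tensor<16x16xf32>
    // Tile results are written back in parallel instead of being yielded.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %fill into %out[%i, %j] [16, 16] [1, 1]
          : tensor<16x16xf32> into tensor<64x64xf32>
    }
  }
  return %0 : tensor<64x64xf32>
}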

File tree

5 files changed: +24 −52 lines

compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileRootAndFuseProducerConsumer.cpp

Lines changed: 8 additions & 11 deletions
@@ -37,11 +37,11 @@ namespace mlir::iree_compiler {
 /// the root operation and fuse the producers of the root operation then
 /// consumers (finds any missing fusion opportunities, then apply producer
 /// fusion). If `onlyFuseProducerInputOperands` is set, only fuse producer input
-/// operands. If `tileUsingForall` is set, creates `scf.forall`, rather than
-/// `scf.for` loops during tiling.
-static FailureOr<Operation *> tileRootAndFuseProducerConsumer(
-    IRRewriter &rewriter, TilingInterface rootOp, int64_t tilingLevel,
-    bool onlyFuseProducerInputOperands, bool tileUsingForall) {
+/// operands.
+static FailureOr<Operation *>
+tileRootAndFuseProducerConsumer(IRRewriter &rewriter, TilingInterface rootOp,
+                                int64_t tilingLevel,
+                                bool onlyFuseProducerInputOperands) {
   auto *context = rewriter.getContext();
   mlir::DominanceInfo dominanceInfo(rootOp);
   llvm::SmallDenseSet<Operation *> tiledAndFusedOps;

@@ -88,7 +88,7 @@ static FailureOr<Operation *> tileRootAndFuseProducerConsumer(
   tilingOptions.setTileSizes(tileSizes);

   // onlyFuseProducerInputOperands implies reduction tiling.
-  if (tileUsingForall && !onlyFuseProducerInputOperands) {
+  if (!onlyFuseProducerInputOperands) {
     tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
   }

@@ -218,7 +218,7 @@ void LLVMCPUTileRootAndFuseProducerConsumer::runOnOperation() {

   if (failed(tileRootAndFuseProducerConsumer(
           rewriter, cast<TilingInterface>(rootOp.value()), tilingLevel,
-          onlyFuseProducerInputOperands, tileUsingForall))) {
+          onlyFuseProducerInputOperands))) {
     funcOp.emitError() << "tiling of level " << tilingLevel.getValue()
                        << " failed\n";
     return signalPassFailure();

@@ -242,20 +242,17 @@ void LLVMCPUTileRootAndFuseProducerConsumer::runOnOperation() {
 } // namespace

 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMCPUTileRootAndFuseProducerConsumer(int64_t tilingLevel,
-                                             bool tileUsingForAll) {
+createLLVMCPUTileRootAndFuseProducerConsumer(int64_t tilingLevel) {
   LLVMCPUTileRootAndFuseProducerConsumerPassOptions options;
   options.tilingLevel = tilingLevel;
   options.onlyFuseProducerInputOperands = false;
-  options.tileUsingForall = tileUsingForAll;
   return std::make_unique<LLVMCPUTileRootAndFuseProducerConsumer>(options);
 }
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMCPUTileRootAndFuseInputOperands(int64_t tilingLevel) {
   LLVMCPUTileRootAndFuseProducerConsumerPassOptions options;
   options.tilingLevel = tilingLevel;
   options.onlyFuseProducerInputOperands = true;
-  options.tileUsingForall = false;
   return std::make_unique<LLVMCPUTileRootAndFuseProducerConsumer>(options);
 }
 } // namespace mlir::iree_compiler

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

Lines changed: 2 additions & 2 deletions
@@ -532,8 +532,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
   addTileAndDistributePasses(funcPassManager);

   funcPassManager.addPass(createLLVMCPUTileRootAndFuseProducerConsumer(
-      static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel()),
-      /*tileUsingForall=*/true));
+      static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
   // The below two passes are nop if the "mmt4d" is explicitly excluded in the
   // ukernels attribute.
   funcPassManager.addPass(createCPUPrepareUkernelsPass());

@@ -647,6 +646,7 @@ void addCPULinalgExtTileAndVectorizePipeline(
   funcPassManager.addPass(
       IREE::LinalgExt::createDecomposeWinogradTransformPass());
   funcPassManager.addPass(IREE::LinalgExt::createDecomposeAttentionPass());
+  funcPassManager.addPass(iree_compiler::createForallToForPass());

   {
     GenericVectorizationPassOptions options;
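The second hunk above adds ForallToFor to the LinalgExt pipeline right before vectorization. As a rough, hand-written sketch (my illustration under assumed shapes, not output copied from the pass), ForallToFor rewrites an scf.forall like the one sketched near the top of this page into a sequential scf.for nest, with tensor.parallel_insert_slice replaced by tensor.insert_slice carried through iter_args:

// Hypothetical result of converting the earlier scf.forall sketch to scf.for.
func.func @tiled_fill_after_forall_to_for(%init: tensor<64x64xf32>) -> tensor<64x64xf32> {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c64 = arith.constant 64 : index
  %cst = arith.constant 0.0 : f32
  // One scf.for per forall induction variable; the shared output becomes an iter_arg.
  %0 = scf.for %i = %c0 to %c64 step %c16 iter_args(%outer = %init) -> (tensor<64x64xf32>) {
    %1 = scf.for %j = %c0 to %c64 step %c16 iter_args(%inner = %outer) -> (tensor<64x64xf32>) {
      %slice = tensor.extract_slice %inner[%i, %j] [16, 16] [1, 1]
          : tensor<64x64xf32> to tensor<16x16xf32>
      %fill = linalg.fill ins(%cst : f32) outs(%slice : tensor<16x16xf32>) -> tensor<16x16xf32>
      // The parallel insert becomes a plain insert_slice yielded to the next iteration.
      %updated = tensor.insert_slice %fill into %inner[%i, %j] [16, 16] [1, 1]
          : tensor<16x16xf32> into tensor<64x64xf32>
      scf.yield %updated : tensor<64x64xf32>
    }
    scf.yield %1 : tensor<64x64xf32>
  }
  return %0 : tensor<64x64xf32>
}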

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h

Lines changed: 1 addition & 2 deletions
@@ -43,8 +43,7 @@ std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMCPUTileAndFusePass(int64_t tilingLevel);

 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMCPUTileRootAndFuseProducerConsumer(int64_t tilingLevel,
-                                             bool tileUsingForall);
+createLLVMCPUTileRootAndFuseProducerConsumer(int64_t tilingLevel);

 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMCPUTileRootAndFuseInputOperands(int64_t tilingLevel);

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td

Lines changed: 1 addition & 4 deletions
@@ -160,10 +160,7 @@ def LLVMCPUTileRootAndFuseProducerConsumerPass
            "only-fuse-producer-input-operands", "bool",
            /*default=*/"false",
            "Specifies if we only want to fuse producer's input operands. "
-           "This is helpful to tile&fuse in case of reduction dimensions.">,
-    Option<"tileUsingForall", "tile-using-forall", "bool",
-           /*default=*/"false",
-           "Tile parallel dimensions using `scf.forall` instead of `scf.for`. Reduction dimension defaults to `scf.for`.">];
+           "This is helpful to tile&fuse in case of reduction dimensions.">];
 }

 def LLVMCPUVerifyVectorSizeLegalityPass :

compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_root_and_fuse_producer_consumer.mlir

Lines changed: 12 additions & 33 deletions
@@ -1,5 +1,4 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=0}), cse)" --split-input-file %s | FileCheck %s
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=0 tile-using-forall=true}), cse)" --split-input-file %s | FileCheck %s --check-prefix=CHECK-FORALL
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=2 only-fuse-producer-input-operands=true}), cse)" --split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION

 #config = #iree_codegen.lowering_config<tile_sizes = [[1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 0, 0, 16, 16, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]]>

@@ -24,21 +23,12 @@ func.func @mmt4d_bias_relu(%arg0: tensor<?x?x16x1xf32>, %arg1: tensor<?x?x16x1xf
   return %4 : tensor<?x?x16x16xf32>
 }
 // CHECK-LABEL: func.func @mmt4d_bias_relu(
-// CHECK: scf.for
+// CHECK: scf.forall
 // CHECK: linalg.fill
 // CHECK-NEXT: %[[MMT4D:.+]] = linalg.mmt4d
 // CHECK: %[[ELEM:.+]] = linalg.generic
-// CHECK: %[[RES0:.+]] = tensor.insert_slice %[[MMT4D]]
-// CHECK: %[[RES1:.+]] = tensor.insert_slice %[[ELEM]]
-// CHECK: scf.yield %[[RES0]], %[[RES1]]
-
-// CHECK-FORALL-LABEL: func.func @mmt4d_bias_relu(
-// CHECK-FORALL: scf.forall
-// CHECK-FORALL: linalg.fill
-// CHECK-FORALL-NEXT: %[[MMT4D:.+]] = linalg.mmt4d
-// CHECK-FORALL: %[[ELEM:.+]] = linalg.generic
-// CHECK-FORALL: scf.forall.in_parallel
-// CHECK-FORALL: tensor.parallel_insert_slice %[[ELEM]]
+// CHECK: scf.forall.in_parallel
+// CHECK: tensor.parallel_insert_slice %[[ELEM]]

 // -----

@@ -72,26 +62,15 @@ func.func @quantized_matmul(%arg0: tensor<2x4x128x16x1xi8>, %arg1: tensor<2x4x16
   %unpack = linalg.unpack %6 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 16] into %7 : tensor<2x4x688x16x16xf32> -> tensor<2x11008x64xf32>
   return %unpack : tensor<2x11008x64xf32>
 }
-// CHECK: func.func @quantized_matmul(
-// CHECK: scf.for
-// CHECK: linalg.generic
-// CHECK: linalg.generic
-// CHECK: linalg.fill
-// CHECK: %[[MMT4D:.+]] = linalg.batch_mmt4d
-// CHECK: %[[UNPACK:.+]] = linalg.unpack
-// CHECK: %[[RES0:.+]] = tensor.insert_slice %[[MMT4D]]
-// CHECK: %[[RES1:.+]] = tensor.insert_slice %[[UNPACK]]
-// CHECK: scf.yield %[[RES0]], %[[RES1]]
-
-// CHECK-FORALL-LABEL: func.func @quantized_matmul(
-// CHECK-FORALL: scf.forall
-// CHECK-FORALL: linalg.generic
-// CHECK-FORALL: linalg.generic
-// CHECK-FORALL: linalg.fill
-// CHECK-FORALL: %[[MMT4D:.+]] = linalg.batch_mmt4d
-// CHECK-FORALL: %[[UNPACK:.+]] = linalg.unpack
-// CHECK-FORALL: scf.forall.in_parallel
-// CHECK-FORALL: tensor.parallel_insert_slice %[[UNPACK]]
+// CHECK-LABEL: func.func @quantized_matmul(
+// CHECK: scf.forall
+// CHECK: linalg.generic
+// CHECK: linalg.generic
+// CHECK: linalg.fill
+// CHECK: %[[MMT4D:.+]] = linalg.batch_mmt4d
+// CHECK: %[[UNPACK:.+]] = linalg.unpack
+// CHECK: scf.forall.in_parallel
+// CHECK: tensor.parallel_insert_slice %[[UNPACK]]

 // -----