
Commit ef0d6dd

hanhanW and keshavvinayak01 authored and committed
[CPU] Tile reduction dimensions for non-root reduction ops. (iree-org#21500)
The revision adds an option to skip root op in LLVMCPUTile pass, and uses it in multi level tiling pipeline. In softmax dispatch, there are two reduction ops. Only the root op is tiled for reduction dimensions when we switched to LLVMCPUTileRootAndFuseInputOperandsPass. It results in large vector sizes in the other reduction op when `util.assume.hint` ops are present. We did not hit the issue in e2e tests because AnnotateDispatchAssumptions pass behaves differently. The value range is [0, 0] if the input is from `flow.tensor.dynamic_constant`. Fixes iree-org#21359 --------- Signed-off-by: hanhanW <[email protected]> Signed-off-by: keshavvinayak01 <[email protected]>
1 parent a7bab8c commit ef0d6dd
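
For quick orientation, the core pipeline change condenses to the steps below. This is a sketch mirroring the Passes.cpp hunk further down, not the full source: the enclosing loop over tiling levels and the switch statement are elided, and `level`/`i` come from that surrounding context.

  // Reduction-level handling in addMultiTilingExpertPassPipeline (condensed).
  funcPassManager.addPass(
      createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
  funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass(level));
  // New in this commit: tile the remaining (non-root) reduction ops at the
  // same level; the root op is already tiled above, so the pass skips it.
  funcPassManager.addPass(createLLVMCPUTilePass(
      static_cast<IREE::CPU::TilingLevel>(i), /*skipRootOp=*/true));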

6 files changed: 144 additions & 13 deletions

compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTile.cpp

Lines changed: 15 additions & 5 deletions
@@ -36,9 +36,7 @@ namespace {
 /// lowering_config.
 struct LLVMCPUTilePass : impl::LLVMCPUTilePassBase<LLVMCPUTilePass> {
   using impl::LLVMCPUTilePassBase<LLVMCPUTilePass>::LLVMCPUTilePassBase;
-  explicit LLVMCPUTilePass(int64_t tilingLevel) {
-    this->tilingLevel = tilingLevel;
-  }
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<arith::ArithDialect, affine::AffineDialect,
                     linalg::LinalgDialect, scf::SCFDialect,
@@ -75,8 +73,17 @@ void LLVMCPUTilePass::runOnOperation() {
       LDBG("can't find lowering_config, skip tiling");
      continue;
    }
+    if (!maybeLoweringConfig.hasTilingLevel(tilingLevel)) {
+      LDBG("target tiling level does not exist");
+      continue;
+    }

     LDBG("candidate: " << op);
+    if (skipRootOp && maybeLoweringConfig.hasWorkgroupTilingLevel()) {
+      LDBG("skip tiling on the root op");
+      continue;
+    }
+
     auto tileSizesAttr = dyn_cast<IREE::Codegen::LoweringConfigTilingLevelAttr>(
         getLoweringConfig(op).getTilingLevelAttr(tilingLevel));
     SmallVector<int64_t> tileSizes(tileSizesAttr.getSizes());
@@ -115,8 +122,11 @@ void LLVMCPUTilePass::runOnOperation() {
 } // namespace

 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMCPUTilePass(int64_t tilingLevel) {
-  return std::make_unique<LLVMCPUTilePass>(tilingLevel);
+createLLVMCPUTilePass(int64_t tilingLevel, bool skipRootOp) {
+  LLVMCPUTilePassOptions options;
+  options.tilingLevel = tilingLevel;
+  options.skipRootOp = skipRootOp;
+  return std::make_unique<LLVMCPUTilePass>(options);
 }

 } // namespace mlir::iree_compiler

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

Lines changed: 9 additions & 4 deletions
@@ -365,8 +365,8 @@ void addCPUBufferOpsTileAndVectorizePipeline(

   // Skip tiling reduction loops because this is expected to apply on copy ops
   // only.
-  funcPassManager.addPass(
-      createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
+  funcPassManager.addPass(createLLVMCPUTilePass(
+      tilingConfig.getVectorCommonParallelLevel(), /*skipRootOp=*/false));
   funcPassManager.addPass(createLLVMCPUPeelPass());
   {
     GenericVectorizationPassOptions options;
@@ -422,6 +422,11 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
           createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
       funcPassManager.addPass(
           createLLVMCPUTileRootAndFuseInputOperandsPass(level));
+      // Tile all the reduction ops for target vector sizes, which ensures
+      // that all the dimensions are tiled in all the reduction ops. The root
+      // op is already tiled, so it is skipped in the pass.
+      funcPassManager.addPass(createLLVMCPUTilePass(
+          static_cast<IREE::CPU::TilingLevel>(i), /*skipRootOp=*/true));
       break;
     case IREE::CPU::TilingLevel::VectorInnerParallelTiles:
       funcPassManager.addPass(createLLVMCPUTileAndFusePass(
@@ -603,8 +608,8 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(
       createCPULowerToUKernelsPass(clSkipIntermediateRoundings));

-  funcPassManager.addPass(
-      createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
+  funcPassManager.addPass(createLLVMCPUTilePass(
+      tilingConfig.getVectorCommonParallelLevel(), /*skipRootOp=*/false));
   if (pipelineOpt.decomposePackUnPackOps) {
     funcPassManager.addPass(createDecomposePackUnPackOpsPass());
   }

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMCPUSplitReductionPass(bool enableReassociateFpReductions);

 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMCPUTilePass(int64_t tilingLevel);
+createLLVMCPUTilePass(int64_t tilingLevel, bool skipRootOp);

 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMCPUTileAndFusePass(int64_t tilingLevel);

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td

Lines changed: 3 additions & 1 deletion
@@ -138,7 +138,9 @@ def LLVMCPUTilePass :
   }];
   let options = [
     Option<"tilingLevel", "tiling-level", "int64_t", /*default=*/"-1",
-           "Tiling level used to retrieve the configuration from lowering_config">
+           "Tiling level used to retrieve the configuration from lowering_config.">,
+    Option<"skipRootOp", "skip-root-op", "bool", /*default=*/"false",
+           "Do not tile the root op if the option is true.">
   ];
 }
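
For reference, MLIR's pass tablegen turns the options above into plain fields on an LLVMCPUTilePassOptions struct, which is what createLLVMCPUTilePass populates in LLVMCPUTile.cpp. A rough sketch of its shape, assuming the usual generated layout (the real generated code also contains registration and parsing plumbing):

  // Approximate shape of the generated options struct (sketch only).
  struct LLVMCPUTilePassOptions {
    int64_t tilingLevel = -1; // set via iree-llvmcpu-tile{tiling-level=N}
    bool skipRootOp = false;  // set via iree-llvmcpu-tile{skip-root-op=true}
  };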

compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir

Lines changed: 76 additions & 0 deletions
@@ -579,3 +579,79 @@ func.func @pooling_nchw_max_pack_with_padding_issue_20723() attributes {hal.exec
 // CHECK: iree_linalg_ext.map_scatter
 // CHECK: } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 // CHECK: scf.forall
+
+// -----
+
+// Verify that the dispatch can be compiled without creating large vectors.
+
+#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "", max_stack_allocation_size = 32768 : i64, native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#pipeline_layout = #hal.pipeline.layout<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+func.func @softmax_dynamic_with_assume_int_hints() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant 0xFFC00000 : f32
+  %c1 = arith.constant 1 : index
+  %c32_i64 = arith.constant 32 : i64
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
+  %6 = arith.extui %0 : i32 to i64
+  %7 = arith.extui %1 : i32 to i64
+  %8 = arith.shli %7, %c32_i64 : i64
+  %9 = arith.ori %6, %8 : i64
+  %10 = arith.index_castui %9 : i64 to index
+  %11 = arith.extui %2 : i32 to i64
+  %12 = arith.extui %3 : i32 to i64
+  %13 = arith.shli %12, %c32_i64 : i64
+  %14 = arith.ori %11, %13 : i64
+  %15 = arith.index_castui %14 : i64 to index
+  %16 = arith.extui %4 : i32 to i64
+  %17 = arith.extui %5 : i32 to i64
+  %18 = arith.shli %17, %c32_i64 : i64
+  %19 = arith.ori %16, %18 : i64
+  %20 = arith.index_castui %19 : i64 to index
+  %21:3 = util.assume.int
+      %10<umin = 0, umax = 9007199254740991>,
+      %15<umin = 0, umax = 9007199254740991>,
+      %20<umin = 0, umax = 9007199254740991>
+    : index, index, index
+  %22 = iree_tensor_ext.dispatch.workload.ordinal %21#0, 0 : index
+  %23 = iree_tensor_ext.dispatch.workload.ordinal %21#1, 1 : index
+  %24 = iree_tensor_ext.dispatch.workload.ordinal %21#2, 2 : index
+  %25 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%22, %23, %24}
+  %26 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%22, %23, %24}
+  %27 = iree_tensor_ext.dispatch.tensor.load %25, offsets = [0, 0, 0], sizes = [%22, %23, %24], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%22, %23, %24} -> tensor<?x?x?xf32>
+  %28 = tensor.empty(%22, %23, %24) : tensor<?x?x?xf32>
+  %dim = tensor.dim %27, %c0 : tensor<?x?x?xf32>
+  %dim_1 = tensor.dim %27, %c1 : tensor<?x?x?xf32>
+  %29 = tensor.empty(%dim, %dim_1) : tensor<?x?xf32>
+  %30 = linalg.fill ins(%cst_0 : f32) outs(%29 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %31 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%27 : tensor<?x?x?xf32>) outs(%30 : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %35 = arith.maxnumf %in, %out : f32
+    linalg.yield %35 : f32
+  } -> tensor<?x?xf32>
+  %32 = linalg.fill ins(%cst : f32) outs(%29 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %33 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%27, %31 : tensor<?x?x?xf32>, tensor<?x?xf32>) outs(%32 : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %35 = arith.subf %in, %in_2 : f32
+    %36 = math.exp %35 : f32
+    %37 = arith.addf %36, %out : f32
+    linalg.yield %37 : f32
+  } -> tensor<?x?xf32>
+  %34 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%27, %31, %33 : tensor<?x?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) outs(%28 : tensor<?x?x?xf32>) {
+  ^bb0(%in: f32, %in_2: f32, %in_3: f32, %out: f32):
+    %35 = arith.subf %in, %in_2 : f32
+    %36 = math.exp %35 : f32
+    %37 = arith.divf %36, %in_3 : f32
+    linalg.yield %37 : f32
+  } -> tensor<?x?x?xf32>
+  iree_tensor_ext.dispatch.tensor.store %34, %26, offsets = [0, 0, 0], sizes = [%22, %23, %24], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%22, %23, %24}
+  return
+}
+// CHECK-LABEL: func.func @softmax_dynamic_with_assume_int_hints(

compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile.mlir

Lines changed: 40 additions & 2 deletions
@@ -1,7 +1,8 @@
+// `TilingLevel=0` indicates DistributionTiles in IREE::CPU::LoweringConfigAttr.
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile{tiling-level=0}))" --split-input-file %s | FileCheck %s
+// `TilingLevel=4` indicates VectorCommonParallelTiles in IREE::CPU::LoweringConfigAttr.
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile{tiling-level=3 skip-root-op=true}))" --split-input-file %s | FileCheck %s --check-prefix=SKIP-ROOT

-// `tiling-level=0`, which is the testing value of the pass option, indicates
-// distribution level tiling.
 #config0 = #iree_cpu.lowering_config<distribution = [10, 20]>
 #config1 = #iree_codegen.lowering_config<tile_sizes = [[10, 20, 30]]>
 func.func @matmul_bias_add(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?xf32>) -> tensor<?x?xf32> {
@@ -98,3 +99,40 @@ func.func @do_not_tile_ukernel(%arg0: tensor<?x?x16x1xf32>, %arg1: tensor<?x?x16
 // CHECK-LABEL: func.func @do_not_tile_ukernel
 // CHECK-NOT: scf.for
 // CHECK: iree_codegen.ukernel.generic
+
+// -----
+
+#config0 = #iree_cpu.lowering_config<vector_common_parallel = [10, 20]>
+#config1 = #iree_cpu.lowering_config<distribution = [10, 20, 30]>
+func.func @matmul_bias_add_skip_matmul(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?xf32>) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+  %init = tensor.empty(%d0, %d1) : tensor<?x?xf32>
+  %0 = linalg.fill {lowering_config = #config0} ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %1 = linalg.matmul {lowering_config = #config1}
+      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %2 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1)-> (d0, d1)>],
+      iterator_types = ["parallel", "parallel"]}
+      ins(%1, %arg2 : tensor<?x?xf32>, tensor<?xf32>)
+      outs(%init : tensor<?x?xf32>) attrs = {lowering_config = #config0} {
+  ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
+    %3 = arith.addf %arg3, %arg4 : f32
+    linalg.yield %3 : f32
+  } -> tensor<?x?xf32>
+  return %2 : tensor<?x?xf32>
+}
+// SKIP-ROOT: func.func @matmul_bias_add_skip_matmul
+// SKIP-ROOT: scf.for
+// SKIP-ROOT: scf.for
+// SKIP-ROOT: linalg.fill
+// SKIP-ROOT: scf.yield
+// SKIP-ROOT: scf.yield
+// SKIP-ROOT: linalg.matmul
+// SKIP-ROOT: scf.for
+// SKIP-ROOT: scf.for
+// SKIP-ROOT: linalg.generic
