
Commit a5e2d27

[GPU] Pad fusion support for TileAndDistributeToWorkgroupsUsingForall (iree-org#20258)
Avoid generating zero-slice guards by using an explicit pattern to swap tensor.extract_slice with tensor.pad in TileAndDistributeToWorkgroupsUsingForall.

Fixes: iree-org#20253
Signed-off-by: Nirvedh Meshram <[email protected]>
1 parent deb8435 commit a5e2d27
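
For context, the "swap" the commit message refers to rewrites a tensor.extract_slice of a tensor.pad into a tensor.pad of a smaller tensor.extract_slice, so each workgroup tile re-pads only its own slice of the unpadded source. Below is a hand-written sketch of the two forms, not actual compiler output: the function names, the 64x64 tile shape, and the offset/size/padding arguments are illustrative, whereas the real pattern derives the per-tile offsets, sizes, and padding amounts from the original pad's low/high values.

  // Before the swap: the workgroup tile is carved out of the already-padded tensor.
  func.func @extract_of_pad(%src: tensor<?x?xf32>, %i: index, %j: index) -> tensor<64x64xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %padded = tensor.pad %src low[1, 1] high[1, 1] {
    ^bb0(%d0: index, %d1: index):
      tensor.yield %cst : f32
    } : tensor<?x?xf32> to tensor<?x?xf32>
    %tile = tensor.extract_slice %padded[%i, %j] [64, 64] [1, 1]
        : tensor<?x?xf32> to tensor<64x64xf32>
    return %tile : tensor<64x64xf32>
  }

  // After the swap with zeroSliceGuard = false: slice the unpadded source and
  // re-pad just the tile, with no scf.if guarding the all-padding case.
  func.func @pad_of_extract(%src: tensor<?x?xf32>, %off0: index, %off1: index,
      %sz0: index, %sz1: index, %lo0: index, %lo1: index,
      %hi0: index, %hi1: index) -> tensor<64x64xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %src_tile = tensor.extract_slice %src[%off0, %off1] [%sz0, %sz1] [1, 1]
        : tensor<?x?xf32> to tensor<?x?xf32>
    %tile = tensor.pad %src_tile low[%lo0, %lo1] high[%hi0, %hi1] {
    ^bb0(%d0: index, %d1: index):
      tensor.yield %cst : f32
    } : tensor<?x?xf32> to tensor<64x64xf32>
    return %tile : tensor<64x64xf32>
  }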

File tree

2 files changed: +29 -0 lines changed


compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp

Lines changed: 9 additions & 0 deletions
@@ -503,6 +503,12 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
   // TODO(Max191): Replace populateSwapExtractWithExpandPattern with upstream
   // MLIR version once it is available (llvm-project/pull/126898).
   populateSwapExtractWithExpandPattern(cleanupPatterns);
+  // When fusing pads we do not want to generate zeroSliceGuards when doing
+  // workgroup tiling. In `GPUApplyTilingLevelPass` we do have an option called
+  // `allowZeroSlices` that can control this but we do not want these
+  // generated if workgroup tiling is happening first.
+  cleanupPatterns.insert<linalg::ExtractSliceOfPadTensorSwapPattern>(
+      context, [](tensor::ExtractSliceOp) { return /*zeroSliceGuard=*/false; });
   tileAndFuseOptions.cleanupPatterns =
       FrozenRewritePatternSet(std::move(cleanupPatterns));

@@ -513,6 +519,9 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
           bool isDestinationOperand)
       -> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
     Operation *owner = originalProducer.getOwner();
+    if (isa<tensor::PadOp>(owner)) {
+      return std::nullopt;
+    }
     bool yieldProducerReplacement = yieldReplacementsFor.contains(owner);
     return scf::SCFTileAndFuseOptions::ControlFnResult{
         yieldProducerReplacement};
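
Taken together, the two hunks keep pad handling out of the generic tile-and-fuse path: the fusion control function now declines tensor.pad producers (std::nullopt), which leaves a plain tensor.extract_slice of the padded value inside the workgroup scf.forall, and the cleanup pattern registered in the first hunk then performs the guard-free swap sketched earlier. What this avoids is the zero-slice guard mentioned in the comment: an scf.if that falls back to materializing pure pad values whenever a tile could land entirely inside the padded region. A hand-written illustration of that guarded form follows; it is not actual compiler output, and in practice the condition, offsets, sizes, and padding amounts are computed from the pad's low/high values rather than passed in as arguments as they are here.

  // Sketch of a zero-slice guard (hypothetical helper, illustrative shapes).
  func.func @zero_slice_guard_sketch(%src: tensor<?x?xf32>, %all_pad: i1,
      %off0: index, %off1: index, %sz0: index, %sz1: index,
      %lo0: index, %lo1: index, %hi0: index, %hi1: index) -> tensor<64x64xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %tile = scf.if %all_pad -> (tensor<64x64xf32>) {
      // The tile lies entirely in the padding: materialize pure pad values.
      %full_pad = tensor.generate {
      ^bb0(%i: index, %j: index):
        tensor.yield %cst : f32
      } : tensor<64x64xf32>
      scf.yield %full_pad : tensor<64x64xf32>
    } else {
      // The tile overlaps the source: slice the source and re-pad the tile.
      %src_tile = tensor.extract_slice %src[%off0, %off1] [%sz0, %sz1] [1, 1]
          : tensor<?x?xf32> to tensor<?x?xf32>
      %pad_tile = tensor.pad %src_tile low[%lo0, %lo1] high[%hi0, %hi1] {
      ^bb0(%i: index, %j: index):
        tensor.yield %cst : f32
      } : tensor<?x?xf32> to tensor<64x64xf32>
      scf.yield %pad_tile : tensor<64x64xf32>
    }
    return %tile : tensor<64x64xf32>
  }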

compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_workgroups_using_forall.mlir

Lines changed: 20 additions & 0 deletions
@@ -803,3 +803,23 @@ func.func @set_encoding_gpu(%arg0 : tensor<?x?xi8>) -> tensor<?x?x8x4x4x4x2x8xi8
 // CHECK:         tensor.expand_shape
 // CHECK:         linalg.generic
 // CHECK:         tensor.parallel_insert_slice
+
+// -----
+
+func.func @pad_fusion(%0 : tensor<?x?xf32>, %1 : tensor<?x?xf32>, %2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %padded = tensor.pad %0 low[1, 1] high[1, 1] {
+  ^bb0(%arg0: index, %arg1: index):
+    tensor.yield %cst : f32
+  } : tensor<?x?xf32> to tensor<?x?xf32>
+  %3 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0]]>}
+      ins(%padded, %1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %3 : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: func @pad_fusion(
+// CHECK:         %[[RESULT:.+]] = scf.forall (%[[ID0:.+]], %[[ID1:.+]])
+// CHECK:           %[[PADDED:.+]] = tensor.pad
+// CHECK:           %[[MATMUL:.+]] = linalg.matmul
+// CHECK-SAME:        ins(%[[PADDED]]
