
Commit cc3b28f

[Codegen][GPU] Improve loop fusion pattern verification (#18671)
The current loop fusion patterns don't verify that the consumer loop won't be predicated after resolution. This verification is needed because fusion introduces barrier semantics into the loop body, which produces invalid IR if the consumer loop later resolves to an `scf.if` (or anything else that could lead to thread divergence). This change also drops the requirement that the consumer loop contain a `tensor.extract_slice`, making the pattern more robust.
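
For intuition, a rough sketch (hypothetical, not taken from this patch) of the failure mode the new check prevents: if the consumer loop's trip count does not match the workgroup size, distributing it over thread IDs introduces a guard, and the barrier that fusion relies on ends up under divergent control flow.

// Hypothetical post-distribution IR; %c64 and the guard are illustrative only.
%tid = gpu.thread_id x
%active = arith.cmpi slt, %tid, %c64 : index
scf.if %active {
  // fused producer results staged through shared memory
  gpu.barrier  // only the guarded threads reach this barrier -> undefined behavior
  // consumer work
}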
1 parent 88cb0ab commit cc3b28f

3 files changed: +213 −34 lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp

Lines changed: 98 additions & 28 deletions
@@ -4,16 +4,21 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h"
 #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Transforms/Transforms.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Casting.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Interfaces/DestinationStyleOpInterface.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
@@ -36,36 +41,82 @@ struct FuseAndHoistParallelLoopsPass final
 };
 } // namespace
 
-struct FuseForalls final : OpRewritePattern<tensor::ExtractSliceOp> {
+static std::optional<int64_t> getStaticForallTripCount(scf::ForallOp forall) {
+  // TODO: Handle non-normalized loops.
+  if (!forall.isNormalized()) {
+    return std::nullopt;
+  }
+  int64_t tripCount = 1;
+  for (OpFoldResult ub : forall.getMixedUpperBound()) {
+    std::optional<int64_t> maybeConstantUb = getConstantIntValue(ub);
+    if (!maybeConstantUb) {
+      return std::nullopt;
+    }
+    tripCount *= *maybeConstantUb;
+  }
+  return tripCount;
+}
+
+static bool forallTripCountMatchesWorkgroupSize(scf::ForallOp forallOp,
+                                                int64_t flatWorkgroupSize) {
+  std::optional<int64_t> maybeTripCount = getStaticForallTripCount(forallOp);
+  if (!maybeTripCount) {
+    return false;
+  }
+
+  // For lane mapped foralls we need to verify that it is contained within
+  // a parent warp mapped op that combines to match the workgroup size.
+  if (forallOpHasMappingType<IREE::GPU::LaneIdAttr>(forallOp)) {
+    auto parentForall = forallOp->getParentOfType<scf::ForallOp>();
+    if (!parentForall ||
+        !forallOpHasMappingType<gpu::GPUWarpMappingAttr>(parentForall)) {
+      return false;
+    }
+
+    std::optional<int64_t> maybeParentTripCount =
+        getStaticForallTripCount(parentForall);
+    if (!maybeParentTripCount) {
+      return false;
+    }
+
+    return *maybeParentTripCount * *maybeTripCount == flatWorkgroupSize;
+  }
+
+  // All other loops must be mapped to threads to compare.
+  if (!forallOpHasMappingType<gpu::GPUThreadMappingAttr>(forallOp)) {
+    return false;
+  }
+
+  return *maybeTripCount == flatWorkgroupSize;
+}
+
+struct FuseForalls final : OpRewritePattern<scf::ForallOp> {
   using OpRewritePattern::OpRewritePattern;
-  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
+  FuseForalls(MLIRContext *ctx, int64_t flatWorkgroupSize, PatternBenefit b = 1)
+      : OpRewritePattern<scf::ForallOp>(ctx, b),
+        flatWorkgroupSize(flatWorkgroupSize) {}
+  LogicalResult matchAndRewrite(scf::ForallOp producerForall,
                                 PatternRewriter &rewriter) const override {
-    auto sliceParent = sliceOp->getParentOfType<scf::ForallOp>();
-    if (!sliceParent) {
-      return failure();
+    if (!producerForall->hasOneUse()) {
+      return rewriter.notifyMatchFailure(producerForall,
+                                         "multi-use producer forall");
     }
 
-    SmallVector<Operation *> consumerChain = {sliceOp};
-    Operation *currProducer = sliceOp.getSource().getDefiningOp();
-    while (currProducer && !llvm::isa<scf::ForallOp>(currProducer) &&
-           currProducer->hasOneUse()) {
-      consumerChain.insert(consumerChain.begin(), currProducer);
-      currProducer =
-          llvm::TypeSwitch<Operation *, Operation *>(currProducer)
-              .Case<tensor::ExpandShapeOp>([](tensor::ExpandShapeOp expand) {
-                return expand.getSrc().getDefiningOp();
-              })
-              .Case<tensor::CollapseShapeOp>(
-                  [](tensor::CollapseShapeOp collapse) {
-                    return collapse.getSrc().getDefiningOp();
-                  })
-              .Default([](Operation *) { return nullptr; });
-    }
-
-    auto producerForall =
-        llvm::dyn_cast_if_present<scf::ForallOp>(currProducer);
-    if (!producerForall) {
-      return failure();
+    SmallVector<Operation *> consumerChain;
+    Operation *currProducer = *producerForall->user_begin();
+    while (currProducer && currProducer->hasOneUse()) {
+      consumerChain.push_back(currProducer);
+      if (!isa<tensor::ExpandShapeOp, tensor::CollapseShapeOp>(currProducer)) {
+        break;
+      }
+      currProducer = *currProducer->user_begin();
+    }
+
+    auto consumerForall = currProducer->getParentOfType<scf::ForallOp>();
+    if (!consumerForall || !forallTripCountMatchesWorkgroupSize(
+                               consumerForall, flatWorkgroupSize)) {
+      return rewriter.notifyMatchFailure(
+          producerForall,
+          "no consumer forall with trip count matching workgroup size");
     }
 
     // TODO: Allow extracting multiple uses within the same consumer loop. Still
@@ -75,9 +126,13 @@ struct FuseForalls final : OpRewritePattern<tensor::ExtractSliceOp> {
       return failure();
     }
 
-    return fuseForallIntoConsumer(rewriter, producerForall, sliceParent,
+    return fuseForallIntoConsumer(rewriter, producerForall, consumerForall,
                                   consumerChain);
   }
+
+private:
+  int64_t flatWorkgroupSize;
+  int64_t subgroupSize;
 };
 
 struct FuseTilableDestinationProducers final : OpRewritePattern<scf::ForallOp> {
@@ -198,12 +253,27 @@ void FuseAndHoistParallelLoopsPass::runOnOperation() {
 
   FunctionOpInterface funcOp = getOperation();
 
+  // Try to get the flat workgroup size if possible.
+  std::optional<int64_t> maybeFlatWorkgroupSize = std::nullopt;
+  if (std::optional<SmallVector<int64_t>> workgroupSize =
+          getWorkgroupSize(funcOp)) {
+    maybeFlatWorkgroupSize =
+        std::accumulate(workgroupSize->begin(), workgroupSize->end(), 1,
+                        std::multiplies<int64_t>());
+  }
+
   // First run the hoisting and fusion patterns.
   {
     RewritePatternSet patterns(context);
     // These two patterns are run to a fixed point, allowing fusion within
    // potentially nested loops, hoisting from said loops, and continued fusion.
-    patterns.add<FuseForalls>(context);
+    if (maybeFlatWorkgroupSize) {
+      // Forall fusion requires knowing the workgroup size to verify the fusion
+      // is valid. Without validation we risk putting barriers inside
+      // conditioned regions (e.g. scf.if/for).
+      patterns.add<FuseForalls>(context, *maybeFlatWorkgroupSize,
+                                /*benefit=*/1);
+    }
     patterns.add<FuseTilableForallConsumers>(context);
     populateForallLoopHoistingPattern(patterns);
     if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
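
To illustrate the new trip-count check in FuseForalls: a thread-mapped consumer forall is accepted only when its static trip count equals the flat workgroup size, while a lane-mapped forall is accepted only when its trip count multiplied by that of its parent warp-mapped forall equals it. A minimal sketch of the lane-mapped nesting, assuming a flat workgroup size of 256 (4 warps of 64 lanes); the mapping attribute spellings here are illustrative and may not match the exact printed forms:

// 4 (warps) * 64 (lanes) == 256 == flat workgroup size, so fusion into this nest is allowed.
scf.forall (%w) in (4) {
  scf.forall (%lane) in (64) {
    // per-lane work
  } {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<x>]}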

compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/fuse_and_hoist_forall.mlir

Lines changed: 112 additions & 3 deletions
@@ -1,11 +1,14 @@
 // RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-gpu-fuse-and-hoist-parallel-loops))' --split-input-file | FileCheck %s
 
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+
 #map = affine_map<(d0) -> (d0 * 2)>
 #map1 = affine_map<(d0) -> (d0 * 4)>
 #map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)>
 #map3 = affine_map<(d0)[s0] -> (d0 * 2 + s0)>
 #map4 = affine_map<(d0) -> (d0 * 16)>
-func.func @forall_fuse_then_hoist(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32> {
+func.func @forall_fuse_then_hoist(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32>
+    attributes {translation_info = #translation_info} {
   %c4 = arith.constant 4 : index
   %c128 = arith.constant 128 : index
   %c0 = arith.constant 0 : index
@@ -62,11 +65,14 @@ func.func @forall_fuse_then_hoist(%3: tensor<128x128xf16>, %4: tensor<128x128xf1
 
 // -----
 
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+
 #map = affine_map<(d0) -> (d0 * 2)>
 #map1 = affine_map<(d0) -> (d0 * 4)>
 #map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)>
 #map3 = affine_map<(d0) -> (d0 * 16)>
-func.func @forall_fuse_then_hoist_mixed_mappings(%3: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32> {
+func.func @forall_fuse_then_hoist_mixed_mappings(%3: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32>
+    attributes {translation_info = #translation_info} {
   %c4 = arith.constant 4 : index
   %c128 = arith.constant 128 : index
   %c0 = arith.constant 0 : index
@@ -113,12 +119,15 @@ func.func @forall_fuse_then_hoist_mixed_mappings(%3: tensor<128x128xf16>, %5: te
 
 // -----
 
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+
 #map = affine_map<(d0) -> (d0 * 2)>
 #map1 = affine_map<(d0) -> (d0 * 4)>
 #map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)>
 #map3 = affine_map<(d0)[s0] -> (d0 * 2 + s0)>
 #map4 = affine_map<(d0) -> (d0 * 16)>
-func.func @forall_fuse_then_hoist_with_fill(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>) -> tensor<128x128xf32> {
+func.func @forall_fuse_then_hoist_with_fill(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>) -> tensor<128x128xf32>
+    attributes {translation_info = #translation_info} {
   %c4 = arith.constant 4 : index
   %c128 = arith.constant 128 : index
   %c0 = arith.constant 0 : index
@@ -340,3 +349,103 @@ func.func @hoist_with_single_trip_loops(%2: tensor<128x128xf16>, %3: tensor<128x
 // CHECK: scf.forall.in_parallel
 // CHECK: scf.forall.in_parallel
 // CHECK: return
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 2)>
+#map1 = affine_map<(d0) -> (d0 * 16)>
+func.func @no_fuse_forall_without_workgroup_size(%arg0: tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %2 = scf.forall (%arg5, %arg6) in (64, 1) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %4 = affine.apply #map(%arg5)
+    %extracted_slice = tensor.extract_slice %arg0[%4, %arg6] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg7[%4, %arg6] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+    %5 = linalg.copy ins(%extracted_slice : tensor<2x128xf32>) outs(%extracted_slice_0 : tensor<2x128xf32>) -> tensor<2x128xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %5 into %arg7[%4, %arg6] [2, 128] [1, 1] : tensor<2x128xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  %3 = scf.forall (%arg5, %arg6) in (8, 8) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %6 = affine.apply #map1(%arg5)
+    %7 = affine.apply #map1(%arg6)
+    %extracted_slice_0 = tensor.extract_slice %2[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg7[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %8 = linalg.matmul ins(%extracted_slice_0, %extracted_slice_0 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg7[%6, %7] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  return %3 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @no_fuse_forall_without_workgroup_size
+// CHECK-COUNT-2: scf.forall {{.*}} -> (tensor<128x128xf32>)
+
+// -----
+
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [128, 1, 1] subgroup_size = 64>
+#map = affine_map<(d0) -> (d0 * 2)>
+#map1 = affine_map<(d0) -> (d0 * 16)>
+func.func @no_fuse_forall_workgroup_size_mismatch(%arg0: tensor<128x128xf32>) -> tensor<128x128xf32>
+    attributes {translation_info = #translation_info} {
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %2 = scf.forall (%arg5, %arg6) in (128, 1) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %4 = affine.apply #map(%arg5)
+    %extracted_slice = tensor.extract_slice %arg0[%4, %arg6] [1, 128] [1, 1] : tensor<128x128xf32> to tensor<1x128xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg7[%4, %arg6] [1, 128] [1, 1] : tensor<128x128xf32> to tensor<1x128xf32>
+    %5 = linalg.copy ins(%extracted_slice : tensor<1x128xf32>) outs(%extracted_slice_0 : tensor<1x128xf32>) -> tensor<1x128xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %5 into %arg7[%4, %arg6] [1, 128] [1, 1] : tensor<1x128xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  // We have 128 threads but only use 64 here, so loops cannot be fused.
+  %3 = scf.forall (%arg5, %arg6) in (8, 8) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %6 = affine.apply #map1(%arg5)
+    %7 = affine.apply #map1(%arg6)
+    %extracted_slice_0 = tensor.extract_slice %2[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg7[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %8 = linalg.matmul ins(%extracted_slice_0, %extracted_slice_0 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg7[%6, %7] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  return %3 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @no_fuse_forall_workgroup_size_mismatch
+// CHECK-COUNT-2: scf.forall {{.*}} -> (tensor<128x128xf32>)
+
+// -----
+
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+#map1 = affine_map<(d0) -> (d0 * 16)>
+func.func @fuse_direct_forall_use(%arg0: tensor<128x128xf32>, %arg1: tensor<16x16xf32>) -> tensor<128x128xf32>
+    attributes {translation_info = #translation_info} {
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %1 = tensor.empty() : tensor<16x16xf32>
+  %2 = scf.forall (%arg5, %arg6) in (4, 4) shared_outs(%arg7 = %1) -> (tensor<16x16xf32>) {
+    %extracted_slice = tensor.extract_slice %arg1[%arg5, %arg6] [4, 4] [1, 1] : tensor<16x16xf32> to tensor<4x4xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg7[%arg5, %arg6] [4, 4] [1, 1] : tensor<16x16xf32> to tensor<4x4xf32>
+    %5 = linalg.copy ins(%extracted_slice : tensor<4x4xf32>) outs(%extracted_slice_0 : tensor<4x4xf32>) -> tensor<4x4xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %5 into %arg7[%arg5, %arg6] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<16x16xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  %3 = scf.forall (%arg5, %arg6) in (8, 8) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %6 = affine.apply #map1(%arg5)
+    %7 = affine.apply #map1(%arg6)
+    %extracted_slice_0 = tensor.extract_slice %arg0[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg7[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %8 = linalg.matmul ins(%2, %extracted_slice_0 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg7[%6, %7] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  return %3 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @fuse_direct_forall_use
+// CHECK: %[[FUSED_LOOP:.+]] = scf.forall
+// CHECK: %[[BARRIER:.+]] = iree_gpu.barrier_region
+// CHECK: linalg.matmul ins(%[[BARRIER]]
+// CHECK: return %[[FUSED_LOOP]]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 3 additions & 3 deletions
@@ -400,7 +400,7 @@ hal.executable public @main {
 #config = #iree_gpu.lowering_config<{
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
-  subgroup = [2, 2],
+  subgroup = [1, 1],
   mma_kind = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>,
   promote_operands = [0, 1]
 }>
@@ -440,8 +440,8 @@ hal.executable public @main {
 // CHECK-LABEL: func @matmul_transpose_b_mfma_32x32x16_i8
 // CHECK-DAG: memref.alloc() : memref<64x40xi8, #gpu.address_space<workgroup>>
 // CHECK-DAG: memref.alloc() : memref<64x40xi8, #gpu.address_space<workgroup>>
-// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x4x1xi32>)
-// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<1x1x4x4x1xi32>)
+// CHECK-COUNT-2: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32
 // CHECK: scf.yield
 
 // -----
