Skip to content

Commit f6a9b6b

Browse files
authored
[Codegen][GPU] Enable destination fusion for unit trip loops (#18674)
When doing loop fusion + hoisting, new fusion opportunities can arise after earlier canonicalization steps have kicked in. This causes problems for unit-trip loops, where the slice on the destination gets folded away. This adds a pattern that moves any DPS producer of the destination into the body of the forall loop in such cases, because unit-trip loops are equivalent to single-threaded regions.
1 parent ad68964 commit f6a9b6b

File tree

2 files changed

+100
-1
lines changed

2 files changed

+100
-1
lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ struct FuseForalls final : OpRewritePattern<scf::ForallOp> {
132132

133133
private:
134134
int64_t flatWorkgroupSize;
135-
int64_t subgroupSize;
136135
};
137136

138137
struct FuseTilableDestinationProducers final : OpRewritePattern<scf::ForallOp> {
@@ -174,6 +173,68 @@ struct FuseTilableDestinationProducers final : OpRewritePattern<scf::ForallOp> {
174173
}
175174
};
176175

176+
// Fuses the destination-style (DPS) producer of a unit-trip-count
// scf.forall's shared_outs operand into the loop body. A forall with a
// static trip count of 1 is equivalent to a single-threaded region, so
// moving the producer inside is safe and restores fusion opportunities
// lost when canonicalization folds away the unit slice on the destination
// (see commit message).
struct FuseUnitLoopDestination final : OpRewritePattern<scf::ForallOp> {
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(scf::ForallOp forallOp,
                                PatternRewriter &rewriter) const override {
    // Only fire on loops whose total trip count is statically known to be 1.
    // NOTE(review): getStaticForallTripCount is defined elsewhere in this
    // file — presumably the product of the static loop bounds; confirm.
    std::optional<int64_t> maybeTripCount = getStaticForallTripCount(forallOp);
    if (!maybeTripCount || *maybeTripCount != 1) {
      return rewriter.notifyMatchFailure(forallOp,
                                         "not a unit trip count loop");
    }
    // Find the first shared_outs operand whose defining op implements
    // DestinationStyleOpInterface; remember both the init value and the
    // corresponding region iter arg.
    DestinationStyleOpInterface dpsProducer;
    BlockArgument bodyArg;
    Value dpsResult;
    for (auto iterArg : forallOp.getRegionIterArgs()) {
      dpsResult = forallOp.getTiedLoopInit(iterArg)->get();
      bodyArg = iterArg;
      dpsProducer = dpsResult.getDefiningOp<DestinationStyleOpInterface>();
      if (dpsProducer) {
        break;
      }
    }
    // Require a single use so moving the producer into the loop cannot
    // change the value observed by any other consumer.
    if (!dpsProducer || !dpsProducer->hasOneUse()) {
      return rewriter.notifyMatchFailure(forallOp,
                                         "no single use DPS producer");
    }

    // The iter arg must feed exactly one tensor.parallel_insert_slice in the
    // terminator; that use is kept pointing at the iter arg below.
    Operation *parallelInsert = nullptr;
    for (auto user : bodyArg.getUsers()) {
      if (isa<tensor::ParallelInsertSliceOp>(user)) {
        // This should be illegal but check anyway.
        if (parallelInsert) {
          return rewriter.notifyMatchFailure(forallOp, "multiple insert users");
        }
        parallelInsert = user;
      }
    }
    if (!parallelInsert) {
      return rewriter.notifyMatchFailure(
          forallOp, "destination not used by a parallel insert");
    }

    // In-place update: the forall op is modified (operand + body) rather
    // than replaced, so bracket the mutations for the rewriter.
    rewriter.startOpModification(forallOp);
    // Move the producer into the body of the forall loop.
    rewriter.moveOpBefore(dpsProducer, forallOp.getBody(),
                          forallOp.getBody()->begin());

    // Replace all uses of the region iter arg with the moved dps op.
    // The parallel insert is excluded: it must keep writing into the
    // loop's destination (the iter arg), not the producer's result.
    rewriter.replaceAllUsesExcept(bodyArg, dpsResult, parallelInsert);

    // Set the init operand of the forall op to the init operand of the
    // producer.
    int64_t dpsInitIndex = cast<OpResult>(dpsResult).getResultNumber();
    forallOp->setOperand(forallOp.getTiedOpOperand(bodyArg)->getOperandNumber(),
                         dpsProducer.getDpsInitOperand(dpsInitIndex)->get());

    // Finally replace the init operand of the moved producer with the region
    // iter arg.
    dpsProducer.setDpsInitOperand(dpsInitIndex, bodyArg);
    rewriter.finalizeOpModification(forallOp);
    return success();
  }
};
237+
177238
struct FuseTilableSliceProducers final
178239
: OpRewritePattern<tensor::ExtractSliceOp> {
179240
using OpRewritePattern::OpRewritePattern;
@@ -290,6 +351,7 @@ void FuseAndHoistParallelLoopsPass::runOnOperation() {
290351
{
291352
RewritePatternSet patterns(context);
292353
patterns.add<FuseTilableDestinationProducers>(context);
354+
patterns.add<FuseUnitLoopDestination>(context);
293355
patterns.add<FuseTilableForallConsumers>(context);
294356
tensor::populateFoldTensorEmptyPatterns(patterns);
295357
scf::ForallOp::getCanonicalizationPatterns(patterns, context);

compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/fuse_and_hoist_forall.mlir

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,3 +449,40 @@ func.func @fuse_direct_forall_use(%arg0: tensor<128x128xf32>, %arg1: tensor<16x1
449449
// CHECK: %[[BARRIER:.+]] = iree_gpu.barrier_region
450450
// CHECK: linalg.matmul ins(%[[BARRIER]]
451451
// CHECK: return %[[FUSED_LOOP]]
452+
453+
// -----

// Regression test for FuseUnitLoopDestination: the scf.forall has a static
// 1x1 (unit) trip count, so the linalg.fill producing its shared_outs
// destination should be fused into the forall body. Per the CHECK lines,
// the forall also ends up hoisted outside the scf.for, with the fill's
// tensor.empty as its destination and the scf.for iterating on the fill
// result inside.
#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>

func.func @forall_hoist_unit_loop_with_fill(%3: tensor<1x128xf16>, %4: tensor<128x1xf16>) -> tensor<1x1xf32>
    attributes {translation_info = #translation_info} {
  %c4 = arith.constant 4 : index
  %c128 = arith.constant 128 : index
  %c0 = arith.constant 0 : index
  %empty = tensor.empty() : tensor<1x1xf32>
  %cst = arith.constant 0.0 : f32
  // Zero-initialized accumulator: the DPS producer the pattern should fuse.
  %5 = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x1xf32>) -> tensor<1x1xf32>
  // K-loop over 128 in steps of 4, accumulating a 1x1 matmul result.
  %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<1x1xf32>) {
    // Unit trip count forall: (1, 1).
    %11 = scf.forall (%arg2, %arg3) in (1, 1) shared_outs(%arg4 = %arg1) -> (tensor<1x1xf32>) {
      %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0)
      %extracted_slice = tensor.extract_slice %3[0, %12] [1, 4] [1, 1] : tensor<1x128xf16> to tensor<1x4xf16>
      %extracted_slice_0 = tensor.extract_slice %4[%12, 0] [4, 1] [1, 1] : tensor<128x1xf16> to tensor<4x1xf16>
      %14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<1x4xf16>, tensor<4x1xf16>) outs(%arg4 : tensor<1x1xf32>) -> tensor<1x1xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %14 into %arg4[%arg2, %arg3] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
      }
    } {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
    scf.yield %11 : tensor<1x1xf32>
  }
  return %8 : tensor<1x1xf32>
}

// CHECK-LABEL: func @forall_hoist_unit_loop_with_fill
//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<1x1xf32>
//       CHECK:   %[[OUTER_PARALLEL:.+]] = scf.forall {{.*}}shared_outs(%[[ITER:.+]] = %[[EMPTY]])
//       CHECK:     %[[FILL:.+]] = linalg.fill {{.*}} outs(%[[ITER]]
//       CHECK:     %[[LOOP:.+]] = scf.for {{.*}} iter_args(%{{.*}} = %[[FILL]])
//       CHECK:       scf.yield {{.*}} : tensor<1x1xf32>
//       CHECK:     scf.forall.in_parallel
//  CHECK-NEXT:       tensor.parallel_insert_slice %[[LOOP]] into %[[ITER]]
//       CHECK:   return %[[OUTER_PARALLEL]]

0 commit comments

Comments
 (0)