diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp index b38dd8effe669..74be68943ee98 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp @@ -1162,24 +1162,48 @@ struct GreedyFusion { } assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth"); - assert(!depthSliceUnions[bestDstLoopDepth - 1].isEmpty() && + + const ComputationSliceState &bestSlice = + depthSliceUnions[bestDstLoopDepth - 1]; + assert(!bestSlice.isEmpty() && "Fusion depth has no computed slice union"); + + // Do not perform sibling fusion if it isn't maximal. We always remove the + // sibling node and as such fusion shouldn't be performed if a part of the + // slice is used in the destination. + auto isMaximal = bestSlice.isMaximal(); + if (!isMaximal.value_or(false)) { + LLVM_DEBUG(llvm::dbgs() + << "Slice isn't maximal; not performing sibling fusion.\n"); + continue; + } + // Check if source loop is being inserted in the innermost // destination loop. Based on this, the fused loop may be optimized // further inside `fuseLoops`. bool isInnermostInsertion = (bestDstLoopDepth == dstLoopDepthTest); // Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'. - affine::fuseLoops(sibAffineForOp, dstAffineForOp, - depthSliceUnions[bestDstLoopDepth - 1], + affine::fuseLoops(sibAffineForOp, dstAffineForOp, bestSlice, isInnermostInsertion); auto dstForInst = cast(dstNode->op); // Update operation position of fused loop nest (if needed). - if (insertPointInst != dstForInst) { + if (insertPointInst != dstForInst) dstForInst->moveBefore(insertPointInst); - } + + LLVM_DEBUG(llvm::dbgs() + << "Fused sibling nest " << sibId << " into destination nest " + << dstNode->id << " at depth " << bestDstLoopDepth << ":\n" + << dstAffineForOp << "\n"); + // Update data dependence graph state post fusion. updateStateAfterSiblingFusion(sibNode, dstNode); + + // Remove old sibling loop nest. + // Get op before we invalidate the MDG node. + Operation *op = sibNode->op; + mdg->removeNode(sibNode->id); + op->erase(); } } @@ -1321,13 +1345,6 @@ struct GreedyFusion { mdg->addToNode(dstNode->id, dstLoopCollector.loadOpInsts, dstLoopCollector.storeOpInsts, dstLoopCollector.memrefLoads, dstLoopCollector.memrefStores, dstLoopCollector.memrefFrees); - // Remove old sibling loop nest if it no longer has outgoing dependence - // edges, and it does not write to a memref which escapes the block. - if (mdg->getOutEdgeCount(sibNode->id) == 0) { - Operation *op = sibNode->op; - mdg->removeNode(sibNode->id); - op->erase(); - } } // Clean up any allocs with no users. diff --git a/mlir/test/Dialect/Affine/loop-fusion-2.mlir b/mlir/test/Dialect/Affine/loop-fusion-2.mlir index 8fec24f71b14a..99207e4910462 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-2.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-2.mlir @@ -389,6 +389,8 @@ func.func @should_fuse_init_loops_siblings_then_shared_producer(%arg0: memref<10 // ----- +// Test sibling fusion of two matrix-vector products sharing the input matrix. + func.func @two_matrix_vector_products() { %in_matrix = memref.alloc() : memref<10x10xf32> %in_vec0 = memref.alloc() : memref<10xf32> diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir index 2830235431c76..813b93a248e4a 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir @@ -1,5 +1,7 @@ // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL +// All fusion: producer-consumer and sibling. +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s --check-prefix=ALL // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(spirv.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=SPIRV // Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir. @@ -108,6 +110,7 @@ func.func @check_src_dst_step(%m : memref<100xf32>, func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) { %cst_0 = arith.constant 0.000000e+00 : f32 %cst_1 = arith.constant 1.000000e+00 : f32 + // This nest writes to %arg1 but can be eliminated post sibling fusion. affine.for %arg3 = 0 to 1 { affine.for %arg4 = 0 to 64 { %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { @@ -137,11 +140,11 @@ func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : m // since the destination loop and source loop trip counts do not // match. // SIBLING-MAXIMAL: %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32 -// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32 -// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_0:.*]]= 0 to 1 { -// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 { -// SIBLING-MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) { -// SIBLING-MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) { +// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32 +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 1 { +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 { +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 32 iter_args(%{{.*}} = %[[cst_1]]) -> (f32) { +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 iter_args(%{{.*}} = %[[cst_0]]) -> (f32) { // ----- @@ -315,11 +318,16 @@ func.func @same_memref_load_store(%producer : memref<32xf32>, %consumer: memref< return } +// ----- + // PRODUCER-CONSUMER-LABEL: func @same_memref_load_multiple_stores +// ALL-LABEL: func @same_memref_load_multiple_stores func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %producer_2 : memref<32xf32>, %consumer: memref<16xf32>){ %cst = arith.constant 2.000000e+00 : f32 - // Source isn't removed. + // Ensure that source isn't removed during both producer-consumer fusion and + // sibling fusion. // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32 + // ALL: affine.for %{{.*}} = 0 to 32 affine.for %arg3 = 0 to 32 { %0 = affine.load %producer[%arg3] : memref<32xf32> %2 = arith.mulf %0, %cst : f32 @@ -343,5 +351,8 @@ func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %produce // PRODUCER-CONSUMER-NEXT: arith.addf // PRODUCER-CONSUMER-NEXT: affine.store // PRODUCER-CONSUMER-NEXT: } + // ALL: affine.for %{{.*}} = 0 to 16 + // ALL: mulf + // ALL: addf return } diff --git a/mlir/test/Dialect/Affine/loop-fusion.mlir b/mlir/test/Dialect/Affine/loop-fusion.mlir index 1c119e87c5336..dcd2e1cdb275a 100644 --- a/mlir/test/Dialect/Affine/loop-fusion.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion.mlir @@ -1206,6 +1206,9 @@ func.func @should_fuse_with_private_memref() { // CHECK: affine.for %{{.*}} = 0 to 17 { // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32> + // CHECK-NEXT: } + // CHECK: affine.for %{{.*}} = 0 to 82 { + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32> // CHECK-NEXT: } // CHECK-NEXT: return