diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index b5eb647a042b9..2073303237f69 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -100,6 +100,7 @@ STATISTIC(OnlySecondCandidateIsGuarded, "The second candidate is guarded while the first one is not"); STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); +STATISTIC(NumDA, "DA checks passed"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -1371,6 +1372,47 @@ struct LoopFuser { << "\n"); } #endif + unsigned Levels = DepResult->getLevels(); + unsigned SameSDLevels = DepResult->getSameSDLevels(); + unsigned CurLoopLevel = FC0.L->getLoopDepth(); + + // Check if DA is missing info regarding the current loop level + if (CurLoopLevel > Levels + SameSDLevels) + return false; + + // Iterating over the outer levels. + for (unsigned Level = 1; Level <= std::min(CurLoopLevel - 1, Levels); + ++Level) { + unsigned Direction = DepResult->getDirection(Level, false); + + // Check if the direction vector does not include equality. If an outer + // loop has a non-equal direction, outer indices are different and it + // is safe to fuse. + if (!(Direction & Dependence::DVEntry::EQ)) { + LLVM_DEBUG(dbgs() << "Safe to fuse due to non-equal acceses in the " + "outer loops\n"); + NumDA++; + return true; + } + } + + assert(CurLoopLevel > Levels && "Fusion candidates are not separated"); + + unsigned CurDir = DepResult->getDirection(CurLoopLevel, true); + + // Check if the direction vector does not include greater direction. In + // that case, the dependency is not backward loop-carried and it is legal + // to fuse. For example, here we have a forward dependency: + // for (int i = 0; i < n; i++) + // A[i] = ...; + // for (int i = 0; i < n; i++) + // ... 
= A[i-1]; + if (!(CurDir & Dependence::DVEntry::GT)) { + LLVM_DEBUG(dbgs() << "Safe to fuse with no backward loop-carried " + "dependency\n"); + NumDA++; + return true; + } if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) LLVM_DEBUG( diff --git a/llvm/test/Transforms/LoopFusion/da_separate_loops.ll b/llvm/test/Transforms/LoopFusion/da_separate_loops.ll new file mode 100644 index 0000000000000..6359f48199290 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/da_separate_loops.ll @@ -0,0 +1,182 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-fusion -da-disable-delinearization-checks -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s +; STAT: 2 loop-fusion - DA checks passed + +; The two inner loops have no dependency and are allowed to be fused because, in +; the outer loops, different indices are accessed. + +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) +;; A[i][j][k] = i; +;; for (long int k = 0; k < n; k++) +;; temp = A[i + 3][j + 2][k + 1]; +;; } +;; } + +define void @nonequal_outer_access(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; 
preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, 1 + %add13 = add nsw i64 %j.07, 2 + %add14 = add nsw i64 %i.011, 3 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %add14, i64 %add13, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void 
+} + +; The two inner loops have a forward loop-carried dependency, allowing them +; to be fused. + +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) +;; A[i][j][k] = i; +;; for (long int k = 0; k < n; k++) +;; temp = A[i][j][k - 1]; +;; } +;; } + +define void @forward_dep(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = 
%for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, -1 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} \ No newline at end of file diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll index d63890df14461..f3cd5877bd4aa 100644 --- a/llvm/test/Transforms/LoopFusion/simple.ll +++ b/llvm/test/Transforms/LoopFusion/simple.ll @@ -298,42 +298,55 @@ bb23: ; preds = %bb17, %bb ret void } +; The following IR is a representation of the provided code below. With PR +; #146383, loop fusion is able to utilize the information from dependence +; analysis, enabling the loops in the function to be fused. 
+; +; void forward_dep(int *arg) { +; for (int i = 0; i < 100; i++) { +; int tmp = i - 3; +; int val = tmp * (i + 3) % i; +; arg[i] = val; +; } +; +; for (int j = 0; j < 100; j++) { +; int val = arg[j - 3]; +; arg[j] = val * 3; +; } +; } +; define void @forward_dep(ptr noalias %arg) { ; CHECK-LABEL: @forward_dep( -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB7:.*]] ; CHECK: bb7: -; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB14:%.*]] ] -; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[BB14]] ] +; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP15:%.*]], %[[BB25:.*]] ] +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], %[[BB25]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BB25]] ], [ 0, %[[BB]] ] ; CHECK-NEXT: [[TMP:%.*]] = add nsw i32 [[DOT013]], -3 ; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 3 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV22]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = srem i32 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV22]] ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP13]], align 4 -; CHECK-NEXT: br label [[BB14]] +; CHECK-NEXT: br label %[[BB14:.*]] ; CHECK: bb14: -; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 -; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 -; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 -; CHECK-NEXT: br i1 [[EXITCOND4]], label [[BB7]], label [[BB19_PREHEADER:%.*]] -; CHECK: bb19.preheader: -; CHECK-NEXT: br label [[BB19:%.*]] -; CHECK: bb19: -; CHECK-NEXT: 
[[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB25:%.*]] ], [ 0, [[BB19_PREHEADER]] ] ; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[INDVARS_IV1]], -3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = mul nsw i32 [[TMP22]], 3 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV1]] ; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 -; CHECK-NEXT: br label [[BB25]] +; CHECK-NEXT: br label %[[BB25]] ; CHECK: bb25: +; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 +; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19]], label [[BB26:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[BB7]], label %[[BB26:.*]] ; CHECK: bb26: ; CHECK-NEXT: ret void ;