diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 277f530ee25fc..d1966011c4645 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -84,13 +85,16 @@ static void printDepMatrix(CharMatrix &DepMatrix) { static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, - ScalarEvolution *SE) { + ScalarEvolution *SE, LoopInfo *LI) { using ValueVector = SmallVector; ValueVector MemInstr; - // For each block. - for (BasicBlock *BB : L->blocks()) { + // Traverse blocks in fixed RPOT order, regardless of their storage in the + // loop info, as it may be arbitrary. + LoopBlocksRPO RPOT(L); + RPOT.perform(LI); + for (BasicBlock *BB : RPOT) { // Scan the BB and collect legal loads and stores. for (Instruction &I : *BB) { if (!isa(I)) @@ -115,18 +119,14 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { for (J = I, JE = MemInstr.end(); J != JE; ++J) { std::vector Dep; - Instruction *Src = cast(*I); - Instruction *Dst = cast(*J); + Instruction *Src = cast(*J); + Instruction *Dst = cast(*I); // Ignore Input dependencies. if (isa(Src) && isa(Dst)) continue; // Track Output, Flow, and Anti dependencies. if (auto D = DI->depends(Src, Dst, true)) { assert(D->isOrdered() && "Expected an output, flow or anti dep."); - // If the direction vector is negative, normalize it to - // make it non-negative. - if (D->normalize(SE)) - LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n"); LLVM_DEBUG(StringRef DepType = D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; dbgs() << "Found " << DepType @@ -438,7 +438,7 @@ struct LoopInterchange { CharMatrix DependencyMatrix; Loop *OuterMostLoop = *(LoopList.begin()); if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, - OuterMostLoop, DI, SE)) { + OuterMostLoop, DI, SE, LI)) { LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; } diff --git a/llvm/test/Transforms/LoopInterchange/interchange-s231.ll b/llvm/test/Transforms/LoopInterchange/interchange-s231.ll new file mode 100644 index 0000000000000..32c865f276878 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/interchange-s231.ll @@ -0,0 +1,56 @@ +; REQUIRES: asserts +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \ +; RUN: -S -debug 2>&1 | FileCheck %s + +@aa = global [256 x [256 x float]] zeroinitializer, align 64 +@bb = global [256 x [256 x float]] zeroinitializer, align 64 + +;; for (int nl = 0; nl < 10000000/256; nl++) +;; for (int i = 0; i < 256; ++i) +;; for (int j = 1; j < 256; j++) +;; aa[j][i] = aa[j - 1][i] + bb[j][i]; + +; CHECK: Processing InnerLoopId = 2 and OuterLoopId = 1 +; CHECK: Loops interchanged. + +define float @s231() { +entry: + br label %for.cond1.preheader + +; Loop: +for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3 + %nl.036 = phi i32 [ 0, %entry ], [ %inc23, %for.cond.cleanup3 ] + br label %for.cond5.preheader + +for.cond.cleanup3: ; preds = %for.cond.cleanup7 + %inc23 = add nuw nsw i32 %nl.036, 1 + %exitcond41 = icmp ne i32 %inc23, 39062 + br i1 %exitcond41, label %for.cond1.preheader, label %for.cond.cleanup + +for.cond.cleanup7: ; preds = %for.body8 + %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1 + %exitcond40 = icmp ne i64 %indvars.iv.next39, 256 + br i1 %exitcond40, label %for.cond5.preheader, label %for.cond.cleanup3 + +for.body8: ; preds = %for.cond5.preheader, %for.body8 + %indvars.iv = phi i64 [ 1, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ] + %0 = add nsw i64 %indvars.iv, -1 + %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %0, i64 %indvars.iv38 + %1 = load float, ptr %arrayidx10, align 4 + %arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv38 + %2 = load float, ptr %arrayidx14, align 4 + %add = fadd fast float %2, %1 + %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv38 + store float %add, ptr %arrayidx18, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.body8, label %for.cond.cleanup7 + +for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7 + %indvars.iv38 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next39, %for.cond.cleanup7 ] + br label %for.body8 + +; Exit blocks +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret float undef +} diff --git a/llvm/test/Transforms/LoopInterchange/pr56275.ll b/llvm/test/Transforms/LoopInterchange/pr56275.ll index c6078bb45146b..86a4fcc7b2140 100644 --- a/llvm/test/Transforms/LoopInterchange/pr56275.ll +++ b/llvm/test/Transforms/LoopInterchange/pr56275.ll @@ -21,20 +21,14 @@ target triple = "aarch64-unknown-linux-gnu" define void @test1(ptr noalias noundef %a, ptr noalias noundef %b, ptr noalias noundef %c) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP2_HEADER_PREHEADER:%.*]] -; CHECK: loop1.header.preheader: ; CHECK-NEXT: br label [[LOOP1_HEADER:%.*]] ; CHECK: loop1.header: -; CHECK-NEXT: [[I2:%.*]] = phi i64 [ [[I2_INC:%.*]], [[LOOP1_LATCH:%.*]] ], [ 1, [[LOOP1_HEADER_PREHEADER:%.*]] ] +; CHECK-NEXT: [[I2:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[I2_INC:%.*]], [[LOOP1_LATCH:%.*]] ] ; CHECK-NEXT: [[I2_ST:%.*]] = add i64 [[I2]], 1 ; CHECK-NEXT: [[I2_LD:%.*]] = add i64 [[I2]], 0 -; CHECK-NEXT: br label [[LOOP2_HEADER_SPLIT1:%.*]] -; CHECK: loop2.header.preheader: ; CHECK-NEXT: br label [[LOOP2_HEADER:%.*]] ; CHECK: loop2.header: -; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[TMP0:%.*]], [[LOOP2_HEADER_SPLIT:%.*]] ], [ 1, [[LOOP2_HEADER_PREHEADER]] ] -; CHECK-NEXT: br label [[LOOP1_HEADER_PREHEADER]] -; CHECK: loop2.header.split1: +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ 1, [[LOOP1_HEADER]] ], [ [[I1_INC:%.*]], [[LOOP2_HEADER]] ] ; CHECK-NEXT: [[I1_ST:%.*]] = add i64 [[I1]], 0 ; CHECK-NEXT: [[I1_LD:%.*]] = add i64 [[I1]], 0 ; CHECK-NEXT: [[A_ST:%.*]] = getelementptr inbounds [64 x i32], ptr [[A:%.*]], i64 [[I1_ST]], i64 [[I2_ST]] @@ -45,17 +39,13 @@ define void @test1(ptr noalias noundef %a, ptr noalias noundef %b, ptr noalias n ; CHECK-NEXT: store i32 [[B_VAL]], ptr [[A_ST]], align 4 ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A_LD]], align 4 ; CHECK-NEXT: store i32 [[A_VAL]], ptr [[C_ST]], align 4 -; CHECK-NEXT: [[I1_INC:%.*]] = add nuw nsw i64 [[I1]], 1 +; CHECK-NEXT: [[I1_INC]] = add nuw nsw i64 [[I1]], 1 ; CHECK-NEXT: [[LOOP2_EXITCOND_NOT:%.*]] = icmp eq i64 [[I1_INC]], 63 -; CHECK-NEXT: br label [[LOOP1_LATCH]] -; CHECK: loop2.header.split: -; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[I1]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 63 -; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT:%.*]], label [[LOOP2_HEADER]] +; CHECK-NEXT: br i1 [[LOOP2_EXITCOND_NOT]], label [[LOOP1_LATCH]], label [[LOOP2_HEADER]] ; CHECK: loop1.latch: ; CHECK-NEXT: [[I2_INC]] = add nuw nsw i64 [[I2]], 1 ; CHECK-NEXT: [[LOOP1_EXITCOND_NOT:%.*]] = icmp eq i64 [[I2_INC]], 63 -; CHECK-NEXT: br i1 [[LOOP1_EXITCOND_NOT]], label [[LOOP2_HEADER_SPLIT]], label [[LOOP1_HEADER]] +; CHECK-NEXT: br i1 [[LOOP1_EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP1_HEADER]] ; CHECK: exit: ; CHECK-NEXT: ret void ;