Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 71 additions & 28 deletions llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,33 @@ class LoopInterchangeLegality {
SmallVector<PHINode *, 8> InnerLoopInductions;
};

/// Manages information utilized by the profitability check for cache. The main
/// purpose of this class is to delay the computation of CacheCost until it is
/// actually needed.
class LoopInterchangeCacheCostManager {
/// Loop handed to CacheCost::getCacheCost when the cost is first requested.
Loop *OutermostLoop;
/// Analysis results forwarded to CacheCost::getCacheCost; held as a raw
/// pointer and dereferenced only when the cost is computed.
LoopStandardAnalysisResults *AR;
/// Dependence analysis forwarded to CacheCost::getCacheCost; held as a raw
/// pointer and dereferenced only when the cost is computed.
DependenceInfo *DI;

/// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
/// that the result can be nullptr.
/// The optional is empty until the first computation; an engaged optional
/// holding nullptr means "computed, but no CacheCost is available".
std::optional<std::unique_ptr<CacheCost>> CC;

/// Maps each loop to an index representing the optimal position within the
/// loop-nest, as determined by the cache cost analysis.
/// Stays empty when the computed CacheCost is nullptr.
DenseMap<const Loop *, unsigned> CostMap;

/// Computes CC and fills CostMap on the first call; later calls return
/// immediately because CC already holds a value.
/// NOTE(review): the name misspells "Uninitialized"; renaming it requires
/// updating the out-of-line definition and both getters together.
void computeIfUnitinialized();

public:
/// Stores the arguments only; no CacheCost is computed at construction time.
LoopInterchangeCacheCostManager(Loop *OutermostLoop,
LoopStandardAnalysisResults *AR,
DependenceInfo *DI)
: OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
/// Returns the cached CacheCost, computing it on first use. The referenced
/// pointer may be nullptr.
std::unique_ptr<CacheCost> &getCacheCost();
/// Returns the loop-to-position map, computing the CacheCost on first use.
const DenseMap<const Loop *, unsigned> &getCostMap();
};

/// LoopInterchangeProfitability checks if it is profitable to interchange the
/// loop.
class LoopInterchangeProfitability {
Expand All @@ -419,8 +446,7 @@ class LoopInterchangeProfitability {
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix,
const DenseMap<const Loop *, unsigned> &CostMap,
std::unique_ptr<CacheCost> &CC);
LoopInterchangeCacheCostManager &LICCM);

private:
int getInstrOrderCost();
Expand Down Expand Up @@ -477,15 +503,15 @@ struct LoopInterchange {
LoopInfo *LI = nullptr;
DependenceInfo *DI = nullptr;
DominatorTree *DT = nullptr;
std::unique_ptr<CacheCost> CC = nullptr;
LoopStandardAnalysisResults *AR = nullptr;

/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
DominatorTree *DT, LoopStandardAnalysisResults *AR,
OptimizationRemarkEmitter *ORE)
: SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
: SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}

bool run(Loop *L) {
if (L->getParentLoop())
Expand Down Expand Up @@ -540,19 +566,7 @@ struct LoopInterchange {
}

unsigned SelecLoopId = selectLoopForInterchange(LoopList);
// Obtain the loop vector returned from loop cache analysis beforehand,
// and put each <Loop, index> pair into a map for constant time query
// later. Indices in loop vector represent the optimal order of the
// corresponding loop, e.g., given a loopnest with depth N, index 0
// indicates the loop should be placed as the outermost loop and index N
// indicates the loop should be placed as the innermost loop.
//
// For the old pass manager CacheCost would be null.
DenseMap<const Loop *, unsigned> CostMap;
if (CC != nullptr) {
for (const auto &[Idx, Cost] : enumerate(CC->getLoopCosts()))
CostMap[Cost.first] = Idx;
}
LoopInterchangeCacheCostManager LICCM(LoopList[0], AR, DI);
// We try to achieve the globally optimal memory access for the loopnest,
// and do interchange in a bubble-sort fashion. We start from
// the innermost loop, move it outwards to the best possible position
Expand All @@ -561,7 +575,7 @@ struct LoopInterchange {
bool ChangedPerIter = false;
for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
bool Interchanged =
processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
processLoop(LoopList, i, i - 1, DependencyMatrix, LICCM);
ChangedPerIter |= Interchanged;
Changed |= Interchanged;
}
Expand All @@ -576,7 +590,7 @@ struct LoopInterchange {
bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
unsigned OuterLoopId,
std::vector<std::vector<char>> &DependencyMatrix,
const DenseMap<const Loop *, unsigned> &CostMap) {
LoopInterchangeCacheCostManager &LICCM) {
Loop *OuterLoop = LoopList[OuterLoopId];
Loop *InnerLoop = LoopList[InnerLoopId];
LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
Expand All @@ -589,7 +603,7 @@ struct LoopInterchange {
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
DependencyMatrix, CostMap, CC)) {
DependencyMatrix, LICCM)) {
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
Expand Down Expand Up @@ -1122,6 +1136,36 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return true;
}

void LoopInterchangeCacheCostManager::computeIfUnitinialized() {
if (CC.has_value())
return;

LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
// Obtain the loop vector returned from loop cache analysis beforehand,
// and put each <Loop, index> pair into a map for constant time query
// later. Indices in loop vector reprsent the optimal order of the
// corresponding loop, e.g., given a loopnest with depth N, index 0
// indicates the loop should be placed as the outermost loop and index N
// indicates the loop should be placed as the innermost loop.
//
// For the old pass manager CacheCost would be null.
if (*CC != nullptr)
for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
CostMap[Cost.first] = Idx;
}

/// Returns a reference to the cached CacheCost pointer, running the
/// computation first if it has not been done yet. The pointer itself may be
/// nullptr when no CacheCost could be produced.
std::unique_ptr<CacheCost> &LoopInterchangeCacheCostManager::getCacheCost() {
computeIfUnitinialized();
// CC is guaranteed to hold a value after the call above, so dereferencing the
// optional is safe here.
return *CC;
}

/// Returns the map from each loop to its index in the CacheCost loop vector,
/// running the CacheCost computation first if it has not been done yet. The
/// map is empty when the computed CacheCost is nullptr.
const DenseMap<const Loop *, unsigned> &
LoopInterchangeCacheCostManager::getCostMap() {
computeIfUnitinialized();
return CostMap;
}

int LoopInterchangeProfitability::getInstrOrderCost() {
unsigned GoodOrder, BadOrder;
BadOrder = GoodOrder = 0;
Expand Down Expand Up @@ -1247,8 +1291,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
bool LoopInterchangeProfitability::isProfitable(
const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
unsigned OuterLoopId, CharMatrix &DepMatrix,
const DenseMap<const Loop *, unsigned> &CostMap,
std::unique_ptr<CacheCost> &CC) {
LoopInterchangeCacheCostManager &LICCM) {
// isProfitable() is structured to avoid endless loop interchange. If the
// highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
// decide the profitability then, profitability check will stop and return the
Expand All @@ -1261,9 +1304,12 @@ bool LoopInterchangeProfitability::isProfitable(
std::optional<bool> shouldInterchange;
for (RuleTy RT : Profitabilities) {
switch (RT) {
case RuleTy::PerLoopCacheAnalysis:
case RuleTy::PerLoopCacheAnalysis: {
std::unique_ptr<CacheCost> &CC = LICCM.getCacheCost();
const DenseMap<const Loop *, unsigned> &CostMap = LICCM.getCostMap();
shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
break;
}
case RuleTy::PerInstrOrderCost:
shouldInterchange = isProfitablePerInstrOrderCost();
break;
Expand Down Expand Up @@ -1841,10 +1887,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
});

DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
std::unique_ptr<CacheCost> CC =
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);

if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
return PreservedAnalyses::all();
U.markLoopNestChanged(true);
return getLoopPassPreservedAnalyses();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
; REQUIRES: asserts

; RUN: opt -passes=loop-interchange -debug -disable-output %s 2>&1 | FileCheck %s

@A = global [16 x [16 x i32]] zeroinitializer

; Check that the CacheCost is calculated only when required. In this case, it
; is computed after passing the legality check.
;
; for (i = 0; i < 16; i++)
; for (j = 0; j < 16; j++)
; A[j][i] += 1;

; CHECK: Loops are legal to interchange
; CHECK: Compute CacheCost
; Two-level loop nest: %for.i.header and %for.i.latch form the outer loop over
; %i, and %for.j is the single-block inner loop over %j. Both induction
; variables start at 0 and each loop exits once the incremented value equals 16.
define void @legal_to_interchange() {
entry:
br label %for.i.header

for.i.header:
%i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
br label %for.j

for.j:
%j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
; The load and the store use the same address, indexed by (%j, %i), so the
; body is a read-modify-write of a single location.
; NOTE(review): the GEP applies two indices to a [16 x [16 x i32]] source type,
; so the first index (%j) strides over whole 16x16 arrays rather than rows --
; confirm this matches the intended A[j][i] access.
%idx = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %j, i32 %i
%val = load i32, ptr %idx
%inc = add i32 %val, 1
store i32 %inc, ptr %idx
%j.next = add i32 %j, 1
%j.exit = icmp eq i32 %j.next, 16
br i1 %j.exit, label %for.i.latch, label %for.j

for.i.latch:
%i.next = add i32 %i, 1
%i.exit = icmp eq i32 %i.next, 16
br i1 %i.exit, label %exit, label %for.i.header

exit:
ret void
}

; Check that the CacheCost is not calculated when not required. In this case,
; the legality check always fails so that we do not need to compute the
; CacheCost.
;
; for (i = 0; i < 16; i++)
; for (j = 0; j < 16; j++)
; A[j][i] = A[i][j];

; CHECK-NOT: Compute CacheCost
; Same loop-nest shape as the function above: an outer loop over %i
; (%for.i.header / %for.i.latch) and a single-block inner loop over %j
; (%for.j), each running from 0 until the incremented value equals 16.
define void @illegal_to_interchange() {
entry:
br label %for.i.header

for.i.header:
%i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
br label %for.j

for.j:
%j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
; The value is loaded from the address indexed by (%i, %j) and stored to the
; address indexed by (%j, %i), i.e. the two accesses use swapped indices.
; NOTE(review): both GEPs apply two indices to a [16 x [16 x i32]] source type,
; so the first index strides over whole 16x16 arrays rather than rows --
; confirm this matches the intended A[j][i] = A[i][j] access.
%idx.load = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %i, i32 %j
%idx.store = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %j, i32 %i
%val = load i32, ptr %idx.load
store i32 %val, ptr %idx.store
%j.next = add i32 %j, 1
%j.exit = icmp eq i32 %j.next, 16
br i1 %j.exit, label %for.i.latch, label %for.j

for.i.latch:
%i.next = add i32 %i, 1
%i.exit = icmp eq i32 %i.next, 16
br i1 %i.exit, label %exit, label %for.i.header

exit:
ret void
}
Loading