Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,18 @@ using namespace llvm;

#define DEBUG_TYPE "loop-interchange"

/// @{
/// Names of the follow-up metadata attributes used to build new loop IDs
/// after an explicitly requested interchange has been applied. Each of the
/// position-specific names below is passed to makeFollowupLoopID() together
/// with the "followup_all" name.
///
/// Passed along with every position-specific name below.
static const char *const LLVMLoopInterchangeFollowupAll =
"llvm.loop.interchange.followup_all";
/// Used for the loop directly enclosing the interchanged pair, if one exists.
static const char *const LLVMLoopInterchangeFollowupNextOuter =
"llvm.loop.interchange.followup_next_outer";
/// Used for the loop found at the outer position of the pair after the
/// interchange.
static const char *const LLVMLoopInterchangeFollowupOuter =
"llvm.loop.interchange.followup_outer";
/// Used for the loop found at the inner position of the pair after the
/// interchange.
static const char *const LLVMLoopInterchangeFollowupInner =
"llvm.loop.interchange.followup_inner";
/// @}

STATISTIC(LoopsInterchanged, "Number of loops interchanged");

static cl::opt<int> LoopInterchangeCostThreshold(
Expand All @@ -65,6 +77,14 @@ static cl::opt<unsigned int> MaxMemInstrCount(
"in the dependency matrix. Higher value may lead to more interchanges "
"at the cost of compile-time"));

// When set, the pass only handles loops on which interchange is explicitly
// enabled through metadata (see processEnabledLoop) and skips the default
// bubble-sort style search over the loop nest. Defaults to false.
// TODO: Once this pass is enabled by default, remove this option and use the
// value of PipelineTuningOptions.
static cl::opt<bool> OnlyWhenForced(
"loop-interchange-only-when-forced", cl::init(false), cl::ReallyHidden,
cl::desc(
"Apply interchanges only when explicitly specified metadata exists"));

namespace {

using LoopVector = SmallVector<Loop *, 8>;
Expand Down Expand Up @@ -297,6 +317,16 @@ static bool isComputableLoopNest(ScalarEvolution *SE,
return true;
}

/// Read the "llvm.loop.interchange.enable" attribute attached to \p L.
///
/// \returns std::nullopt if the loop does not carry the attribute; otherwise
/// the boolean stored in it (true: interchange explicitly requested, false:
/// interchange explicitly disabled).
///
/// A malformed attribute (no value, or a value that is not a ConstantInt)
/// still triggers the "invalid metadata" assertion in asserts-enabled builds.
/// In builds without assertions it is treated as if the attribute were absent
/// instead of dereferencing an invalid operand.
static std::optional<bool> findMetadata(Loop *L) {
  std::optional<const MDOperand *> Value =
      findStringMetadataForLoop(L, "llvm.loop.interchange.enable");
  if (!Value)
    return std::nullopt;

  // The operand itself may be null or wrap something other than an integer
  // constant, so resolve it with the null-tolerant, checked accessor.
  const MDOperand *Op = *Value;
  ConstantInt *Flag =
      Op ? mdconst::dyn_extract_or_null<ConstantInt>(Op->get()) : nullptr;
  assert(Flag && "invalid metadata");
  if (!Flag)
    return std::nullopt;

  return Flag->getZExtValue() != 0;
}

namespace {

/// LoopInterchangeLegality checks if it is legal to interchange the loop.
Expand Down Expand Up @@ -504,6 +534,12 @@ struct LoopInterchange {
CostMap[LoopCosts[i].first] = i;
}
}

// If OnlyWhenForced is true, only process loops for which interchange is
// explicitly enabled.
if (OnlyWhenForced)
return processEnabledLoop(LoopList, DependencyMatrix, CostMap);

// We try to achieve the globally optimal memory access for the loopnest,
// and do interchange in a bubble-sort fashion. We start from
// the innermost loop, move it outwards to the best possible position
Expand Down Expand Up @@ -532,6 +568,10 @@ struct LoopInterchange {
Loop *InnerLoop = LoopList[InnerLoopId];
LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId << "\n");
if (findMetadata(OuterLoop) == false || findMetadata(InnerLoop) == false) {
LLVM_DEBUG(dbgs() << "Not interchanging loops. It is disabled.\n");
return false;
}
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
Expand Down Expand Up @@ -569,6 +609,152 @@ struct LoopInterchange {

return true;
}

bool processEnabledLoop(SmallVectorImpl<Loop *> &LoopList,
std::vector<std::vector<char>> &DependencyMatrix,
const DenseMap<const Loop *, unsigned> &CostMap) {
bool Changed = false;

// Manage the index so that LoopList[Loop2Index[L]] == L for each loop L.
DenseMap<Loop *, unsigned> Loop2Index;
for (unsigned I = 0; I != LoopList.size(); I++)
Loop2Index[LoopList[I]] = I;

// Hold the outer loops to be exchanged (i.e., loops whose
// "llvm.loop.interchange.enable" metadata is true), in the current nest order.
SmallVector<Loop *, 4> Worklist;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As for the cases where the loops to be interchanged do not interfere with each other, I plan to handle them independently. That is, the following code

#pragma clang loop interchange(enable)
for (i=0; i<N; i++)
  for (j=0; j<N; j++)
    #pragma clang loop interchange(enable)
    for (k=0; k<N; k++)
      for (l=0; l<N; l++)
        ...

will be translated as

!interchange_ij = !{!"interchange_ij", !interchange_enable}
!interchange_kl = !{!"interchange_kl", !interchange_enable}
!interchange_enable = !{!"llvm.loop.interchange.enable", i1 true}

not as follows.

!interchange_kl = !{!"interchange_kl", !interchange_enable, !followup_ij}
!interchange_enable = !{!"llvm.loop.interchange.enable", i1 true}
!followup_ij = !{"followup_next_next_outer", ...}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fine, but also the less interesting case.


// Helper function that appends L to the Worklist when interchange is
// explicitly enabled on it. Returns false if L overlaps with the loop pair
// represented by the most recently added entry; returns true otherwise,
// including when L is not enabled and nothing is added.
auto AddLoopIfEnabled = [&](Loop *L) {
  // Loops without an explicit "enable" request are skipped.
  if (findMetadata(L) != true)
    return true;

  if (!Worklist.empty()) {
    // The Worklist follows the current nest order, so only its last entry can
    // collide with L. That entry stands for the pair (Prev, Prev + 1).
    const unsigned PrevInnerIdx = Loop2Index[Worklist.back()] + 1;
    const unsigned CandidateIdx = Loop2Index[L];
    if (CandidateIdx <= PrevInnerIdx) {
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "AmbiguousOrder",
                                        L->getStartLoc(), L->getHeader())
               << "The loops to be interchanged are overlapping.";
      });
      return false;
    }
  }

  Worklist.push_back(L);
  return true;
};

// Initialize Worklist. To process the loops in inner-loop-first order, add
// them to the worklist in the outer-loop-first order.
for (unsigned I = 0; I != LoopList.size(); I++)
if (!AddLoopIfEnabled(LoopList[I]))
return Changed;

// Set an upper bound of the number of transformations to avoid infinite
// loop. There is no deep meaning behind the current value (square of the
// size of LoopList).
// TODO: Is this really necessary?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the metadata is properly removed after the interchange has been applied, then it is not necessary. Consider adding an assertion for debug builds that fires when that happens.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 0e954b3

const unsigned MaxAttemptsCount = LoopList.size() * LoopList.size();
unsigned Attempts = 0;

// Process the loops. An exchange is applied to two loops, but a metadata
// replacement can be applied to three loops: the two loops plus the next
// outer loop, if it exists. This is because it's necessary to express the
// information about the order of the application of interchanges in cases
// where the target loops to be exchanged are overlapping, e.g.,
//
// #pragma clang loop interchange(enable)
// for(int i=0;i<N;i++)
// #pragma clang loop interchange(enable)
// for (int j=0;j<N;j++)
// for (int k=0;k<N;k++)
// ...
//
// In this case we will exchange the innermost two loops at first, the
// follow-up metadata including enabling interchange is attached on the
// outermost loop, and it is enqueued as the next candidate to be processed.
while (!Worklist.empty() && Attempts < MaxAttemptsCount) {
Loop *TargetLoop = Worklist.pop_back_val();
assert(findMetadata(TargetLoop) == true &&
"Some metadata was unexpectedlly removed");
unsigned OuterLoopId = Loop2Index[TargetLoop];
unsigned InnerLoopId = OuterLoopId + 1;
if (InnerLoopId >= LoopList.size()) {
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "InnermostLoop",
TargetLoop->getStartLoc(),
TargetLoop->getHeader())
<< "The metadata is invalid with an innermost loop.";
});
break;
}
MDNode *LoopID = TargetLoop->getLoopID();
bool Interchanged = processLoop(LoopList, InnerLoopId, OuterLoopId,
DependencyMatrix, CostMap);
if (!Interchanged) {
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NotInterchanged",
TargetLoop->getStartLoc(),
TargetLoop->getHeader())
<< "Failed to perform explicitly specified loop interchange.";
});
break;
}

// The next outer loop, or nullptr if TargetLoop is the outermost one.
Loop *NextOuterLoop = nullptr;
if (0 < OuterLoopId)
NextOuterLoop = LoopList[OuterLoopId - 1];
Loop *OuterLoop = LoopList[OuterLoopId];
Loop *InnerLoop = LoopList[InnerLoopId];
Attempts++;
Changed = true;
Loop2Index[OuterLoop] = OuterLoopId;
Loop2Index[InnerLoop] = InnerLoopId;

// Update the metadata.
std::optional<MDNode *> MDNextOuterLoopID =
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
LLVMLoopInterchangeFollowupNextOuter});
std::optional<MDNode *> MDOuterLoopID =
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
LLVMLoopInterchangeFollowupOuter});
std::optional<MDNode *> MDInnerLoopID =
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
LLVMLoopInterchangeFollowupInner});
if (MDNextOuterLoopID) {
if (NextOuterLoop) {
NextOuterLoop->setLoopID(*MDNextOuterLoopID);
} else {
LLVM_DEBUG(dbgs()
<< "New metadata for the next outer loop is ignored.\n");
}
}
if (MDOuterLoopID)
OuterLoop->setLoopID(*MDOuterLoopID);
if (MDInnerLoopID)
InnerLoop->setLoopID(*MDInnerLoopID);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because this is inside the function processEnabledLoop, these metadata processes are not performed by default (i.e., OnlyWhenForced is false). However, if I understand it correctly, follow-up metadata will be generated whenever the pragma enabling interchange is specified, e.g., the following case:

// compilation options: -O3 -floop-interchange

#pragma clang loop interchange(enable) unroll(disable)  // Enabling interchange by pragma doesn't make sense, since loop-interchange is enabled by the compilation option.
for (int i = 0; i < N; i++)
  for (int j = 0; j < N; j++)
    ...

So should we also handle the follow-up metadata in processLoopList? If my understanding is correct, any other loop optimization passes (like unroll, distribute, etc.) don't handle them in the similar situation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Loops should be re-added to the worklist, so any followup loop interchange can be processed.

Other passes apply just one transformation, usually because doing it multiple times is nonsensical. E.g. you would not want to vectorize a loop twice.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I misunderstood other passes' behavior.

Loops should be re-added to the worklist, so any followup loop interchange can be processed.

Yes, this is already done.

My concern is how to handle metadata when we use the bubble sort algorithm. Do the changes in 7eff317 make sense?


// Add new elements, paying attention to the order.
bool Valid = true;
if (NextOuterLoop)
Valid &= AddLoopIfEnabled(NextOuterLoop);
Valid &= AddLoopIfEnabled(OuterLoop);
Valid &= AddLoopIfEnabled(InnerLoop);
if (!Valid)
break;
}

LLVM_DEBUG({
if (!Worklist.empty())
dbgs() << "Some metadata was ignored because the maximum number of "
"attempts was reached.\n";
});
return Changed;
}
};

} // end anonymous namespace
Expand Down
109 changes: 109 additions & 0 deletions llvm/test/Transforms/LoopInterchange/metadata-disable.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=0 --cache-line-size=64 -S < %s | FileCheck %s

; Check that the interchange is not applied to the loop that is disabled by
; metadata. The original code is as below:
;
; for (int i=0; i<128; i++)
; for (int j=0; j<128; j++)
; #pragma clang loop interchange(disable)
; for (int k=0; k<128; k++)
; for (int l=0; l<128; l++)
; a[l][k][j][i]++;
;
; Since interchange is disabled for the k-loop, the pair (i, j) is the
; only candidate for exchange.

@a = dso_local local_unnamed_addr global [128 x [128 x [128 x [128 x i32]]]] zeroinitializer, align 4

define void @f() {
; CHECK-LABEL: define void @f() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[FOR_J_HEADER_PREHEADER:.*]]
; CHECK: [[FOR_I_HEADER_PREHEADER:.*]]:
; CHECK-NEXT: br label %[[FOR_I_HEADER:.*]]
; CHECK: [[FOR_I_HEADER]]:
; CHECK-NEXT: [[IV_I:%.*]] = phi i64 [ [[IV_I_NEXT:%.*]], %[[FOR_I_CLEANUP:.*]] ], [ 0, %[[FOR_I_HEADER_PREHEADER]] ]
; CHECK-NEXT: br label %[[FOR_K_HEADER:.*]]
; CHECK: [[FOR_J_HEADER_PREHEADER]]:
; CHECK-NEXT: br label %[[FOR_J_HEADER:.*]]
; CHECK: [[FOR_J_HEADER]]:
; CHECK-NEXT: [[IV_J:%.*]] = phi i64 [ [[IV_J_NEXT:%.*]], %[[FOR_J_CLEANUP:.*]] ], [ 0, %[[FOR_J_HEADER_PREHEADER]] ]
; CHECK-NEXT: br label %[[FOR_I_HEADER_PREHEADER]]
; CHECK: [[FOR_K_HEADER]]:
; CHECK-NEXT: [[IV_K:%.*]] = phi i64 [ 0, %[[FOR_I_HEADER]] ], [ [[IV_K_NEXT:%.*]], %[[FOR_K_CLEANUP:.*]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[IV_L:%.*]] = phi i64 [ 0, %[[FOR_K_HEADER]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds nuw [128 x [128 x [128 x [128 x i32]]]], ptr @a, i64 [[IV_L]], i64 [[IV_K]], i64 [[IV_J]], i64 [[IV_I]]
; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR]], align 4
; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[VAL]], 1
; CHECK-NEXT: store i32 [[INC]], ptr [[PTR]], align 4
; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[IV_L]], 1
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 128
; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_K_CLEANUP]], label %[[FOR_BODY]]
; CHECK: [[FOR_K_CLEANUP]]:
; CHECK-NEXT: [[IV_K_NEXT]] = add nuw nsw i64 [[IV_K]], 1
; CHECK-NEXT: [[EXITCOND_K:%.*]] = icmp eq i64 [[IV_K_NEXT]], 128
; CHECK-NEXT: br i1 [[EXITCOND_K]], label %[[FOR_I_CLEANUP]], label %[[FOR_K_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[FOR_J_CLEANUP]]:
; CHECK-NEXT: [[IV_J_NEXT]] = add nuw nsw i64 [[IV_J]], 1
; CHECK-NEXT: [[EXITCOND_J:%.*]] = icmp eq i64 [[IV_J_NEXT]], 128
; CHECK-NEXT: br i1 [[EXITCOND_J]], label %[[EXIT:.*]], label %[[FOR_J_HEADER]]
; CHECK: [[FOR_I_CLEANUP]]:
; CHECK-NEXT: [[IV_I_NEXT]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[IV_I_NEXT]], 128
; CHECK-NEXT: br i1 [[EXITCOND_I]], label %[[EXIT]], label %[[FOR_J_CLEANUP]], label %[[FOR_I_HEADER]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; Input IR: a four-deep loop nest in the order i, j, k, l. Every induction
; variable starts at 0 and its loop exits once the incremented value equals 128.
entry:
br label %for.i.header

; Header of the outermost (i) loop.
for.i.header:
%iv.i = phi i64 [ 0, %entry ], [ %iv.i.next, %for.i.cleanup ]
br label %for.j.header

; Header of the j-loop.
for.j.header:
%iv.j = phi i64 [ 0, %for.i.header ], [ %iv.j.next, %for.j.cleanup ]
br label %for.k.header

; Header of the k-loop.
for.k.header:
%iv.k = phi i64 [ 0, %for.j.header ], [ %iv.k.next, %for.k.cleanup ]
br label %for.body

; Innermost (l) loop: increments a[l][k][j][i].
for.body:
%iv.l = phi i64 [ 0, %for.k.header ], [ %iv.l.next, %for.body ]
%ptr = getelementptr inbounds nuw [128 x [128 x [128 x [128 x i32]]]], ptr @a, i64 %iv.l, i64 %iv.k, i64 %iv.j, i64 %iv.i
%val = load i32, ptr %ptr, align 4
%inc = add nuw nsw i32 %val, 1
store i32 %inc, ptr %ptr, align 4
%iv.l.next = add nuw nsw i64 %iv.l, 1
%exitcond.l = icmp eq i64 %iv.l.next, 128
br i1 %exitcond.l, label %for.k.cleanup, label %for.body

; Latch of the k-loop. Its back-edge branch is the only one carrying loop
; metadata (!llvm.loop !0).
for.k.cleanup:
%iv.k.next = add nuw nsw i64 %iv.k, 1
%exitcond.k = icmp eq i64 %iv.k.next, 128
br i1 %exitcond.k, label %for.j.cleanup, label %for.k.header, !llvm.loop !0

; Latch of the j-loop.
for.j.cleanup:
%iv.j.next = add nuw nsw i64 %iv.j, 1
%exitcond.j = icmp eq i64 %iv.j.next, 128
br i1 %exitcond.j, label %for.i.cleanup, label %for.j.header

; Latch of the i-loop.
for.i.cleanup:
%iv.i.next = add nuw nsw i64 %iv.i, 1
%exitcond.i = icmp eq i64 %iv.i.next, 128
br i1 %exitcond.i, label %exit, label %for.i.header

exit:
ret void
}

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.interchange.enable", i1 false}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.interchange.enable", i1 false}
;.
Loading
Loading