llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (67 additions, 1 deletion)

@@ -47,6 +47,13 @@ static cl::opt<unsigned> UnrollThresholdIf(
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(200), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdNestedStatic(
"amdgpu-unroll-threshold-nested-static",
cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose "
"trip count will be made runtime-independent when fully-unrolling "
"the outer loop"),
cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
@@ -148,8 +155,67 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
}

unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);

if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
// Look for subloops whose trip count would go from runtime-dependent to
// runtime-independent if we were to unroll the loop. Give a bonus to the
// current loop's unrolling threshold for each of these, as fully unrolling
// it would likely expose additional optimization opportunities.
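// As a hypothetical source-level illustration (not taken from this patch)
// of the pattern this heuristic rewards:
//   for (int i = 0; i < 8; ++i)    // L, induction variable i
//     for (int j = i; j < 8; ++j)  // SubLoop: bounds derived only from
//       mem[i * 16 + j] = 0;       // i and constants
// Fully unrolling L fixes i in each copy of SubLoop, so every copy gets a
// compile-time-constant trip count and can itself be fully unrolled.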
for (const Loop *SubLoop : L->getSubLoops()) {
std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
if (!Bounds)
continue;
Value *InitIV = &Bounds->getInitialIVValue();
Value *FinalIV = &Bounds->getFinalIVValue();
Value *StepVal = Bounds->getStepValue();
if (!StepVal)
continue;

// Determines whether SubIV's value derives exclusively from constants
// and/or IV. If it does, SubIVDependsOnIV is additionally set to true
// when IV is involved in the derivation.
bool SubIVDependsOnIV = false;
std::function<bool(const Value *, unsigned)> FromConstsOrLoopIV =
[&](const Value *SubIV, unsigned Depth) -> bool {
if (SubIV == IV) {
SubIVDependsOnIV = true;
return true;
}
if (isa<Constant>(SubIV))
return true;
if (Depth >= 10)
return false;

const Instruction *I = dyn_cast<Instruction>(SubIV);
// No point in checking outside the loop since IV is necessarily inside
// it; also stop searching when encountering an instruction that will
// likely not allow SubIV's value to be statically computed.
if (!I || !L->contains(I) || !isa<BinaryOperator, CastInst, PHINode>(I))
return false;

// SubIV depends on constants or IV if all of the instruction's
// operands involved in its derivation also depend on constants or IV.
return llvm::all_of(I->operand_values(), [&](const Value *V) {
return FromConstsOrLoopIV(V, Depth + 1);
});
};

if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) &&
FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) {
UP.Threshold += UnrollThresholdNestedStatic;
LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
<< " for loop:\n"
<< *L
<< " due to subloop's trip count becoming "
"runtime-independent after unrolling:\n "
<< *SubLoop);
if (UP.Threshold >= MaxBoost)
return;
}
}
}

for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getDataLayout();
unsigned LocalGEPsSeen = 0;
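As a usage sketch (not part of this diff), the bonus granted per qualifying subloop can be tuned through the flag introduced above; the value 400 below is an arbitrary illustration:

  opt -S -mtriple=amdgcn-- -passes=loop-unroll \
      -amdgpu-unroll-threshold-nested-static=400 input.ll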
llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll (84 additions)

@@ -0,0 +1,84 @@
; REQUIRES: asserts
; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s

; For @dependent_sub_fullunroll, the threshold bonus should apply
; CHECK: due to subloop's trip count becoming runtime-independent after unrolling

; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling

; Check that the outer loop of a double-nested loop, where the inner loop's
; trip count depends exclusively on constants and the outer IV, is fully
; unrolled thanks to the threshold bonus from AMDGPU's TTI.

; CHECK-LABEL: @dependent_sub_fullunroll
; CHECK: inner.header_latch_exiting.7
; CHECK: outer.latch_exiting.7

define void @dependent_sub_fullunroll(ptr noundef %mem) {
entry:
br label %outer.header

outer.header: ; preds = %entry, %outer.latch_exiting
%outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
br label %inner.header_latch_exiting

inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
%idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
%idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, 8
br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1

outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
%outer.cond = icmp ult i32 %outer.iv_next, 8
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1

end: ; preds = %outer.latch_exiting
ret void
}

; Check that the outer loop of the same loop nest as dependent_sub_fullunroll
; is not fully unrolled when the inner loop's final IV value depends on a
; function argument instead of a combination of the outer IV and constants.

; CHECK-LABEL: @dependent_sub_no_fullunroll
; CHECK-NOT: outer.latch_exiting.7

define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
entry:
br label %outer.header

outer.header: ; preds = %entry, %outer.latch_exiting
%outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
br label %inner.header_latch_exiting

inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
%idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
%idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1

outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
%outer.cond = icmp ult i32 %outer.iv_next, 8
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1

end: ; preds = %outer.latch_exiting
ret void
}

!1 = !{!1, !2}
!2 = !{!"amdgpu.loop.unroll.threshold", i32 100}
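The new test can be run in the usual way; since the RUN line relies on -debug-only, this assumes an assertions-enabled build of LLVM:

  llvm-lit llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll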