38 changes: 32 additions & 6 deletions llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1318,6 +1318,11 @@ class LSRUse {
/// the loop, in which case some special-case heuristics may be used.
bool AllFixupsOutsideLoop = true;

/// This records whether all of the fixups using this LSRUse are unconditional
/// within the loop, meaning they will be executed in every iteration of the
/// loop.
bool AllFixupsUnconditional = true;

/// RigidFormula is set to true to guarantee that this use will be associated
/// with a single formula--the one that initially matched. Some SCEV
/// expressions cannot be expanded. This allows LSR to consider the registers
@@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
const SCEV *Start;
const SCEVConstant *Step;
if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
const APInt *Step;
if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
SE->isLoopInvariant(Start, L)))
bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
F.BaseOffset.isFixed() &&
*Step == F.BaseOffset.getFixedValue();
bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
!isa<SCEVConstant>(Start) &&
SE->isLoopInvariant(Start, L);
// We can only pre or post index when the load/store is unconditional.
if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
LoopCost = 0;
}
}

// If the loop counts down to zero and we'll be using a hardware loop then
// the addrec will be combined into the hardware loop instruction.
if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
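
As background for the cost change in this hunk (an editorial illustration, not part of the patch): pre- and post-indexed addressing fold the pointer increment into the load or store itself, so the increment only happens on iterations where that access actually executes. A minimal C sketch, with hypothetical names and an assumed 4-byte element:

```c
/* Editorial sketch, not from the patch; assumes 32-bit elements and a step
   of one element per iteration. Both modes fuse the pointer update into the
   load itself, so the update only happens when that load executes. */
int sum_post(const int *p, int n) {
  int sum = 0;
  for (int i = 0; i < n; i++) {
    sum += *p;   /* use the current address...                          */
    p += 1;      /* ...then advance: post-indexed, "ldr wN, [xM], #4"   */
  }
  return sum;
}

int sum_pre(const int *p, int n) {
  int sum = 0;
  for (int i = 0; i < n; i++) {
    p += 1;      /* advance first...                                    */
    sum += *p;   /* ...then use it: pre-indexed, "ldr wN, [xM, #4]!";
                    this fits when the access offset equals the step.   */
  }
  return sum;
}
```

The two functions deliberately touch slightly different elements; the point is only where the pointer update sits relative to the access, and why a guarded access cannot carry the update for the whole loop.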
@@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const {
if (AllFixupsOutsideLoop)
OS << ", all-fixups-outside-loop";

if (AllFixupsUnconditional)
OS << ", all-fixups-unconditional";

if (WidestFixupType)
OS << ", widest fixup type: " << *WidestFixupType;
}
@@ -2213,6 +2227,7 @@ class LSRInstance {
void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
void CountRegisters(const Formula &F, size_t LUIdx);
bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;

void CollectLoopInvariantFixupsAndFormulae();

@@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
LF.PostIncLoops = TmpPostIncLoops;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);

// Create SCEV as Formula for calculating baseline cost
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
return true;
}

/// Test whether this fixup will be executed each time the corresponding IV
/// increment instruction is executed.
bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
// If the fixup block dominates the IV increment block then there is no path
// through the loop to the increment that doesn't pass through the fixup.
return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
}

/// Check for other uses of loop-invariant values which we're tracking. These
/// other uses will pin these values in registers, making them less profitable
/// for elimination.
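
For the dominance test introduced above: a fixup is unconditional exactly when every path through the loop body to the IV increment passes through its block. A hedged C-level sketch of the two cases, mirroring the conditional_load test updated below (names are hypothetical):

```c
/* Editorial sketch, all names hypothetical. In the first loop the access to
   q[i] runs on every iteration, so its block dominates the IV increment and
   AllFixupsUnconditional stays true. In the second, q[i] is only read when
   the guard holds, so its block does not dominate the increment and the
   pre/post-index bonus is withheld. */
int every_iteration(const int *q, int n) {
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += q[i];          /* unconditional fixup */
  return sum;
}

int guarded(const int *p, const int *q, int n) {
  int sum = 0;
  for (int i = 0; i < n; i++)
    if (p[i] != 0)
      sum += q[i];        /* conditional fixup: may be skipped */
  return sum;
}
```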
@@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
LF.OperandValToReplace = U;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');

LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;

// Transfer the fixups of LU to LUThatHas.
for (LSRFixup &Fixup : LU.Fixups) {
70 changes: 37 additions & 33 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
@@ -6,77 +6,81 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: subs.w r9, r1, #1
; CHECK-NEXT: beq .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: and r8, r9, #3
; CHECK-NEXT: and r6, r9, #3
; CHECK-NEXT: subs r7, r1, #2
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: b .LBB0_6
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: cbnz r6, .LBB0_7
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new
; CHECK-NEXT: bic r7, r9, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: str r6, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: movs r7, #4
; CHECK-NEXT: .LBB0_5: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r10, [r0, #16]!
; CHECK-NEXT: sub.w r9, r9, #4
; CHECK-NEXT: ldrd r5, r4, [r0, #-12]
; CHECK-NEXT: ldr r11, [r0, #-4]
; CHECK-NEXT: ldr r11, [r0, #16]!
; CHECK-NEXT: ldrd r5, r7, [r0, #-12]
; CHECK-NEXT: ldr r4, [r0, #-4]
; CHECK-NEXT: cmp r12, r5
; CHECK-NEXT: it gt
; CHECK-NEXT: subgt r6, r7, #3
; CHECK-NEXT: csel r5, r5, r12, gt
; CHECK-NEXT: cmp r5, r4
; CHECK-NEXT: csinc r6, r10, r8, le
; CHECK-NEXT: cmp r5, r7
; CHECK-NEXT: it gt
; CHECK-NEXT: subgt r6, r7, #2
; CHECK-NEXT: csel r5, r4, r5, gt
; CHECK-NEXT: cmp r5, r11
; CHECK-NEXT: addgt.w r6, r8, #2
; CHECK-NEXT: csel r7, r7, r5, gt
; CHECK-NEXT: cmp r7, r4
; CHECK-NEXT: it gt
; CHECK-NEXT: subgt r6, r7, #1
; CHECK-NEXT: csel r5, r11, r5, gt
; CHECK-NEXT: cmp r5, r10
; CHECK-NEXT: csel r6, r7, r6, gt
; CHECK-NEXT: add.w r7, r7, #4
; CHECK-NEXT: csel r12, r10, r5, gt
; CHECK-NEXT: addgt.w r6, r8, #3
; CHECK-NEXT: csel r7, r4, r7, gt
; CHECK-NEXT: add.w r8, r8, #4
; CHECK-NEXT: cmp r7, r11
; CHECK-NEXT: csel r10, r8, r6, gt
; CHECK-NEXT: csel r12, r11, r7, gt
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa
; CHECK-NEXT: cmp.w r8, #0
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.7: @ %while.body.epil
; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit
; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload
; CHECK-NEXT: sub.w r9, r9, r8
; CHECK-NEXT: cbz r6, .LBB0_10
; CHECK-NEXT: .LBB0_7: @ %while.body.epil
; CHECK-NEXT: ldr r7, [r0, #4]
; CHECK-NEXT: sub.w r1, r1, r9
; CHECK-NEXT: cmp r12, r7
; CHECK-NEXT: csel r6, r1, r6, gt
; CHECK-NEXT: csel r10, r1, r10, gt
; CHECK-NEXT: csel r12, r7, r12, gt
; CHECK-NEXT: cmp.w r8, #1
; CHECK-NEXT: cmp r6, #1
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1
; CHECK-NEXT: ldr r7, [r0, #8]
; CHECK-NEXT: cmp r12, r7
; CHECK-NEXT: csinc r6, r6, r1, le
; CHECK-NEXT: csinc r10, r10, r1, le
; CHECK-NEXT: csel r12, r7, r12, gt
; CHECK-NEXT: cmp.w r8, #2
; CHECK-NEXT: cmp r6, #2
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2
; CHECK-NEXT: ldr r0, [r0, #12]
; CHECK-NEXT: cmp r12, r0
; CHECK-NEXT: it gt
; CHECK-NEXT: addgt r6, r1, #2
; CHECK-NEXT: addgt.w r10, r1, #2
; CHECK-NEXT: csel r12, r0, r12, gt
; CHECK-NEXT: .LBB0_10: @ %while.end
; CHECK-NEXT: str.w r12, [r2]
; CHECK-NEXT: str r6, [r3]
; CHECK-NEXT: str.w r10, [r3]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%0 = load i32, ptr %pSrc, align 4
144 changes: 140 additions & 4 deletions llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
@@ -119,31 +119,29 @@ for.end:
; We can't use postindex addressing on the conditional load of qval and can't
; convert the loop condition to a compare with zero, so we should instead use
; offset addressing.
; FIXME: Currently we don't notice the load of qval is conditional, and attempt
; postindex addressing anyway.
define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
; CHECK-LABEL: define i32 @conditional_load(
; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[PVAL]], 0
; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
; CHECK: [[IF_THEN]]:
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IDX]], 2
; CHECK-NEXT: [[LSR_IV:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP0]]
; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]]
; CHECK-NEXT: br label %[[FOR_INC]]
; CHECK: [[FOR_INC]]:
; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -176,3 +174,141 @@ exit:
exit:
ret i32 %ret.next
}

; We can use postindex addressing for both loads here, even though the second
; may not be executed on every loop iteration.
define i32 @early_exit_load(ptr %p, ptr %q, ptr %n) {
; CHECK-LABEL: define i32 @early_exit_load(
; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
; CHECK-NEXT: [[RET_PHI:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[PVAL]], 0
; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_INC]], label %[[EXIT:.*]]
; CHECK: [[FOR_INC]]:
; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
; CHECK-NEXT: [[ADD]] = add nsw i32 [[QVAL]], [[RET_PHI]]
; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY]], label %[[EXIT]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_PHI]], %[[FOR_BODY]] ], [ [[ADD]], %[[FOR_INC]] ]
; CHECK-NEXT: ret i32 [[RET]]
;
entry:
br label %for.body

for.body:
%ret.phi = phi i32 [ %add, %for.inc ], [ 0, %entry ]
%idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
%paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx
%pval = load i32, ptr %paddr, align 4
%cmp1 = icmp eq i32 %pval, 0
br i1 %cmp1, label %for.inc, label %exit

for.inc:
%qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx
%qval = load i32, ptr %qaddr, align 4
%add = add nsw i32 %qval, %ret.phi
%idx.next = add nuw nsw i64 %idx, 1
%nval = load volatile i64, ptr %n, align 8
%cmp2 = icmp slt i64 %idx.next, %nval
br i1 %cmp2, label %for.body, label %exit

exit:
%ret = phi i32 [ %ret.phi, %for.body ], [ %add, %for.inc ]
ret i32 %ret
}

; The control-flow before and after the load of qval shouldn't prevent postindex
; addressing from happening.
; FIXME: We choose postindex addressing, but the scevgep is placed in for.inc so
; during codegen we will fail to actually generate a postindex load.
define void @middle_block_load(ptr %p, ptr %q, i64 %n) {
; CHECK-LABEL: define void @middle_block_load(
; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_INC]] ], [ [[N]], %[[ENTRY]] ]
; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV2]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[PVAL]], 0
; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 4
; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN1:.*]], label %[[IF_ELSE1:.*]]
; CHECK: [[IF_THEN1]]:
; CHECK-NEXT: tail call void @otherfn1()
; CHECK-NEXT: br label %[[IF_END:.*]]
; CHECK: [[IF_ELSE1]]:
; CHECK-NEXT: tail call void @otherfn2()
; CHECK-NEXT: br label %[[IF_END]]
; CHECK: [[IF_END]]:
; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[QVAL]], 0
; CHECK-NEXT: br i1 [[CMP2]], label %[[IF_THEN2:.*]], label %[[IF_ELSE2:.*]]
; CHECK: [[IF_THEN2]]:
; CHECK-NEXT: tail call void @otherfn1()
; CHECK-NEXT: br label %[[FOR_INC]]
; CHECK: [[IF_ELSE2]]:
; CHECK-NEXT: tail call void @otherfn2()
; CHECK-NEXT: br label %[[FOR_INC]]
; CHECK: [[FOR_INC]]:
; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; CHECK-NEXT: br i1 [[CMP3]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %for.body

for.body:
%idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
%paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx
%pval = load i32, ptr %paddr, align 4
%cmp1 = icmp sgt i32 %pval, 0
br i1 %cmp1, label %if.then1, label %if.else1

if.then1:
tail call void @otherfn1()
br label %if.end

if.else1:
tail call void @otherfn2()
br label %if.end

if.end:
%qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx
%qval = load i32, ptr %qaddr, align 4
%cmp2 = icmp sgt i32 %qval, 0
br i1 %cmp2, label %if.then2, label %if.else2

if.then2:
tail call void @otherfn1()
br label %for.inc

if.else2:
tail call void @otherfn2()
br label %for.inc

for.inc:
%idx.next = add nuw nsw i64 %idx, 1
%cmp3 = icmp eq i64 %idx.next, %n
br i1 %cmp3, label %exit, label %for.body

exit:
ret void
}

declare dso_local void @otherfn1()
declare dso_local void @otherfn2()