32 changes: 27 additions & 5 deletions llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1278,6 +1278,7 @@ struct LSRFixup {
LSRFixup() = default;

bool isUseFullyOutsideLoop(const Loop *L) const;
+bool isUseUnconditional(const Loop *L) const;

void print(raw_ostream &OS) const;
void dump() const;
@@ -1318,6 +1319,11 @@ class LSRUse {
/// the loop, in which case some special-case heuristics may be used.
bool AllFixupsOutsideLoop = true;

+/// This records whether all of the fixups using this LSRUse are unconditional
+/// within the loop, meaning they will be executed in every iteration of the
+/// loop.
+bool AllFixupsUnconditional = true;

/// RigidFormula is set to true to guarantee that this use will be associated
/// with a single formula--the one that initially matched. Some SCEV
/// expressions cannot be expanded. This allows LSR to consider the registers
@@ -1422,15 +1428,19 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
const SCEV *Start;
const SCEVConstant *Step;
-if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
+if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
-if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
-Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
-((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
-SE->isLoopInvariant(Start, L)))
+bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
+Step->getAPInt() == F.BaseOffset.getFixedValue();
+bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
+SE->isLoopInvariant(Start, L);
+// We can only pre or post index when the load/store is unconditional.
+if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
LoopCost = 0;
+}
}

// If the loop counts down to zero and we'll be using a hardware loop then
// the addrec will be combined into the hardware loop instruction.
if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
@@ -1647,6 +1657,12 @@ bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
return !L->contains(UserInst);
}

+/// Test whether this fixup is for an instruction that's unconditional, i.e.
+/// it's executed in every loop iteration.
+bool LSRFixup::isUseUnconditional(const Loop *L) const {
+  return isGuaranteedToExecuteForEveryIteration(UserInst, L);
Contributor:

Hm... what kind of "unconditional" execution do you actually need in this context? There's a hierarchy here:

  1. Not inside interior control flow in the loop (your test case).
  2. Plus no early exits.
  3. Plus no abnormal exits.

I think that at least you don't care about abnormal exits in this context? If there is an instruction that can throw/diverge in the loop, that shouldn't interfere with indexed addressing, right? In that case using isGuaranteedToExecuteForEveryIteration() is both unnecessarily conservative and unnecessarily expensive, as it has to scan for abnormal exits.

Collaborator (Author):

What we ultimately want to check for is situations where, when we reach codegen, the load/store and the add will be combined into a single pre/post-increment load/store. For this to happen during ISel, the load/store and the add need to be in the same block. Looking into this a bit, LSRInstance::ImplementSolution is where we decide the InsertPos, but it currently only allows the loop header and the IVIncInsertPos, which will be the loop latch in most cases.

I think it should be possible to change this to insert the add instruction in the LSRUse block when that block dominates IVIncInsertPos: in that case every path to IVIncInsertPos goes through the LSRUse block, so an instruction inserted there is guaranteed to be available at IVIncInsertPos. I've changed this patch to check for that, and I'll be doing a follow-on patch to change LSRInstance::ImplementSolution to hoist to this location.
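To make that condition concrete, here is a minimal C++ sketch of the dominance check described above, not code from this patch: the helper name userDominatesIVIncInsertPos is invented for illustration, and it assumes the DominatorTree and IVIncInsertPos that LSR already has access to.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Sketch only: a fixup's user is "unconditional enough" for pre/post-indexed
// addressing if every path to the IV increment insertion point passes through
// the user's block, i.e. the user's block dominates that insertion point.
static bool userDominatesIVIncInsertPos(const Instruction *UserInst,
                                        const Loop *L,
                                        const DominatorTree &DT,
                                        const Instruction *IVIncInsertPos) {
  // A user outside the loop is never a candidate for folding into the
  // in-loop IV increment.
  if (!L->contains(UserInst))
    return false;
  // Block-level dominance: an add inserted in UserInst's block is available
  // wherever the IV increment ends up being placed.
  return DT.dominates(UserInst->getParent(), IVIncInsertPos->getParent());
}

Under that condition an add placed next to the load/store reaches IVIncInsertPos on every path, which is what the follow-on ImplementSolution change would rely on.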

+}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::print(raw_ostream &OS) const {
OS << "UserInst=";
@@ -1783,6 +1799,9 @@ void LSRUse::print(raw_ostream &OS) const {
if (AllFixupsOutsideLoop)
OS << ", all-fixups-outside-loop";

+if (AllFixupsUnconditional)
+OS << ", all-fixups-unconditional";

if (WidestFixupType)
OS << ", widest fixup type: " << *WidestFixupType;
}
@@ -3607,6 +3626,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
LF.PostIncLoops = TmpPostIncLoops;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+LU.AllFixupsUnconditional &= LF.isUseUnconditional(L);

// Create SCEV as Formula for calculating baseline cost
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3803,6 +3823,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
LF.OperandValToReplace = U;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+LU.AllFixupsUnconditional &= LF.isUseUnconditional(L);
if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -4940,6 +4961,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');

LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;

// Transfer the fixups of LU to LUThatHas.
for (LSRFixup &Fixup : LU.Fixups) {
6 changes: 2 additions & 4 deletions llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
@@ -119,31 +119,29 @@ for.end:
; We can't use postindex addressing on the conditional load of qval and can't
; convert the loop condition to a compare with zero, so we should instead use
; offset addressing.
-; FIXME: Currently we don't notice the load of qval is conditional, and attempt
-; postindex addressing anyway.
define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
; CHECK-LABEL: define i32 @conditional_load(
; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[PVAL]], 0
; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IDX]], 2
+; CHECK-NEXT: [[LSR_IV:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP0]]
; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]]
; CHECK-NEXT: br label %[[FOR_INC]]
; CHECK: [[FOR_INC]]:
; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
-; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]: