Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1982,17 +1982,36 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
return true;
}

static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
const LoopInfo *LI,
Value *&RemAmtOut,
PHINode *&LoopIncrPNOut) {
static bool isRemOfLoopIncrementWithLoopInvariant(
Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, Value *&AddInstOut,
Value *&AddOffsetOut, PHINode *&LoopIncrPNOut) {
Value *Incr, *RemAmt;
// NB: If RemAmt is a power of 2 it *should* have been transformed by now.
if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
return false;

Value *AddInst, *AddOffset;
// Find out loop increment PHI.
auto *PN = dyn_cast<PHINode>(Incr);
if (PN != nullptr) {
AddInst = nullptr;
AddOffset = nullptr;
} else {
// Search through a NUW add on top of the loop increment.
Value *V0, *V1;
if (!match(Incr, m_NUWAdd(m_Value(V0), m_Value(V1))))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be worth adding an m_PHI matcher? We could then use m_c_NUWAdd and avoid the messy commutative matching below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could, although I'm not sure exactly what we would want in a PHI matcher. Would we support matching BBs? A variable number of arguments, or "binary" phis only? Etc.
Either way, I think that it's a conversation that is a bit beyond the scope of this particular use case and PR.

return false;

AddInst = Incr;
PN = dyn_cast<PHINode>(V0);
if (PN != nullptr) {
AddOffset = V1;
} else {
PN = dyn_cast<PHINode>(V1);
AddOffset = V0;
}
}

if (!PN)
return false;

Expand Down Expand Up @@ -2032,6 +2051,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
// Set output variables.
RemAmtOut = RemAmt;
LoopIncrPNOut = PN;
AddInstOut = AddInst;
AddOffsetOut = AddOffset;

return true;
}
Expand All @@ -2046,15 +2067,14 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
// for(i = Start; i < End; ++i, ++rem)
// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
//
// Currently only implemented for `IncrLoopInvariant` being zero.
static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
const LoopInfo *LI,
SmallSet<BasicBlock *, 32> &FreshBBs,
bool IsHuge) {
Value *RemAmt;
Value *AddOffset, *RemAmt, *AddInst;
PHINode *LoopIncrPN;
if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN))
if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, AddInst,
AddOffset, LoopIncrPN))
return false;

// Only non-constant remainder as the extra IV is probably not profitable
Expand All @@ -2072,6 +2092,23 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,

Loop *L = LI->getLoopFor(LoopIncrPN->getParent());
Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader());
// If we have an add, create the initial value for the remainder.
// The logic here is:
// (urem (add nuw Start, IncrLoopInvariant), RemAmtLoopInvariant)
//
// Only proceed if the expression simplifies (otherwise we can't fully
// optimize out the urem).
if (AddInst) {
assert(AddOffset && "We found an add but missing values");
// Without dom-condition/assumption cache we aren't likely to get much out
// of a context instruction.
Start = simplifyAddInst(Start, AddOffset,
match(AddInst, m_NSWAdd(m_Value(), m_Value())),
/*IsNUW=*/true, *DL);
if (!Start)
return false;
}

// If we can't fully optimize out the `rem`, skip this transform.
Start = simplifyURemInst(Start, RemAmt, *DL);
if (!Start)
Expand Down Expand Up @@ -2099,9 +2136,12 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
FreshBBs.insert(LoopIncrPN->getParent());
FreshBBs.insert(L->getLoopLatch());
FreshBBs.insert(Rem->getParent());

if (AddInst)
FreshBBs.insert(cast<Instruction>(AddInst)->getParent());
replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
Rem->eraseFromParent();
if (AddInst && AddInst->use_empty())
cast<Instruction>(AddInst)->eraseFromParent();
return true;
}

Expand Down
16 changes: 9 additions & 7 deletions llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -319,20 +319,20 @@ for.body.tail:
define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
; CHECK-LABEL: define void @simple_urem_to_sel_vec(
; CHECK-SAME: <2 x i64> [[REM_AMT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FOR_COND_CLEANUP:.*]]:
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[ENTRY:.*]]:
; CHECK: [[FOR_COND_CLEANUP:.*]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[FOR_COND_CLEANUP]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[FOR_COND_CLEANUP]] ]
; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ]
; CHECK-NEXT: tail call void @use.2xi64(<2 x i64> [[REM]])
; CHECK-NEXT: [[TMP1:%.*]] = add nuw <2 x i64> [[REM]], <i64 1, i64 1>
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], [[REM_AMT]]
; CHECK-NEXT: [[TMP3]] = select <2 x i1> [[TMP2]], <2 x i64> zeroinitializer, <2 x i64> [[TMP1]]
; CHECK-NEXT: [[INC]] = add nuw <2 x i64> [[I_04]], <i64 1, i64 1>
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = call i1 @get.i1()
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[ENTRY]], label %[[FOR_BODY]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
;
entry:
br label %for.body
Expand Down Expand Up @@ -892,10 +892,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
; CHECK-NEXT: tail call void @use.i32(i32 [[REM]])
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
Expand Down