diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 631cc26d6022f..790de0b822f66 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1982,17 +1982,36 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI, return true; } -static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem, - const LoopInfo *LI, - Value *&RemAmtOut, - PHINode *&LoopIncrPNOut) { +static bool isRemOfLoopIncrementWithLoopInvariant( + Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, Value *&AddInstOut, + Value *&AddOffsetOut, PHINode *&LoopIncrPNOut) { Value *Incr, *RemAmt; // NB: If RemAmt is a power of 2 it *should* have been transformed by now. if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt)))) return false; + Value *AddInst, *AddOffset; // Find out loop increment PHI. auto *PN = dyn_cast(Incr); + if (PN != nullptr) { + AddInst = nullptr; + AddOffset = nullptr; + } else { + // Search through a NUW add on top of the loop increment. + Value *V0, *V1; + if (!match(Incr, m_NUWAdd(m_Value(V0), m_Value(V1)))) + return false; + + AddInst = Incr; + PN = dyn_cast(V0); + if (PN != nullptr) { + AddOffset = V1; + } else { + PN = dyn_cast(V1); + AddOffset = V0; + } + } + if (!PN) return false; @@ -2032,6 +2051,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem, // Set output variables. RemAmtOut = RemAmt; LoopIncrPNOut = PN; + AddInstOut = AddInst; + AddOffsetOut = AddOffset; return true; } @@ -2046,15 +2067,14 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem, // Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant; // for(i = Start; i < End; ++i, ++rem) // Rem = rem == RemAmtLoopInvariant ? 0 : Rem; -// -// Currently only implemented for `IncrLoopInvariant` being zero. 
static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, const LoopInfo *LI, SmallSet &FreshBBs, bool IsHuge) { - Value *RemAmt; + Value *AddOffset, *RemAmt, *AddInst; PHINode *LoopIncrPN; - if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN)) + if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, AddInst, + AddOffset, LoopIncrPN)) return false; // Only non-constant remainder as the extra IV is probably not profitable @@ -2072,6 +2092,23 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, Loop *L = LI->getLoopFor(LoopIncrPN->getParent()); Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()); + // If we have an add, create the initial value for the remainder. + // The logic here is: + // (urem (add nuw Start, IncrLoopInvariant), RemAmtLoopInvariant) + // + // Only proceed if the expression simplifies (otherwise we can't fully + // optimize out the urem). + if (AddInst) { + assert(AddOffset && "We found an add but missing values"); + // Without dom-condition/assumption cache we aren't likely to get much out + // of a context instruction. + Start = simplifyAddInst(Start, AddOffset, + match(AddInst, m_NSWAdd(m_Value(), m_Value())), + /*IsNUW=*/true, *DL); + if (!Start) + return false; + } + // If we can't fully optimize out the `rem`, skip this transform. 
Start = simplifyURemInst(Start, RemAmt, *DL); if (!Start) @@ -2099,9 +2136,12 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, FreshBBs.insert(LoopIncrPN->getParent()); FreshBBs.insert(L->getLoopLatch()); FreshBBs.insert(Rem->getParent()); - + if (AddInst) + FreshBBs.insert(cast(AddInst)->getParent()); replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge); Rem->eraseFromParent(); + if (AddInst && AddInst->use_empty()) + cast(AddInst)->eraseFromParent(); return true; } diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll index abfbf2e5e582e..33d18d0e2a795 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll @@ -319,20 +319,20 @@ for.body.tail: define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind { ; CHECK-LABEL: define void @simple_urem_to_sel_vec( ; CHECK-SAME: <2 x i64> [[REM_AMT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[ENTRY:.*]]: +; CHECK: [[FOR_COND_CLEANUP:.*]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[FOR_COND_CLEANUP]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[FOR_COND_CLEANUP]] ] +; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ] ; CHECK-NEXT: tail call void @use.2xi64(<2 x i64> [[REM]]) ; CHECK-NEXT: [[TMP1:%.*]] = add nuw <2 x i64> [[REM]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], [[REM_AMT]] ; CHECK-NEXT: [[TMP3]] = select <2 x i1> [[TMP2]], <2 x i64> zeroinitializer, <2 x i64> [[TMP1]] ; CHECK-NEXT: [[INC]] = add 
nuw <2 x i64> [[I_04]], ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = call i1 @get.i1() -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[ENTRY]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; entry: br label %for.body @@ -892,10 +892,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5 -; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]] ; CHECK-NEXT: tail call void @use.i32(i32 [[REM]]) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]] +; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]