Skip to content

Commit ef023ca

Browse files
authored
Reland [VPlan] Expand WidenInt inductions with nuw/nsw (llvm#168354)
Changes: The previous patch had to be reverted due to a mismatching-OpType assert in CSE. A reduced test corresponding to an RVV pointer-induction has now been added, and the pointer-induction case has been updated to use createOverflowingBinaryOp. While at it, record VPIRFlags in VPWidenInductionRecipe.
1 parent e468ea3 commit ef023ca

File tree

121 files changed

+934
-800
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+934
-800
lines changed

flang/test/Integration/unroll-loops.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ subroutine unroll(a)
2525
! NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]
2626
! NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]]
2727
! NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2
28-
! NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2)
28+
! NO-UNROLL-NEXT: %[[NVIND]] = add nuw nsw <2 x i64> %[[VIND]], splat (i64 2)
2929
!
3030
! UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2)
3131
! UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]

flang/test/Lower/HLFIR/unroll-loops.fir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func.func @unroll(%arg0: !fir.ref<!fir.array<1000 x index>> {fir.bindc_name = "a
2727
// NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]
2828
// NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]]
2929
// NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2
30-
// NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2)
30+
// NO-UNROLL-NEXT: %[[NVIND]] = add nuw nsw <2 x i64> %[[VIND]], splat (i64 2)
3131

3232
// UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2)
3333
// UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,10 @@ class VPBuilder {
178178
new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name));
179179
}
180180

181-
VPInstruction *createOverflowingOp(unsigned Opcode,
182-
ArrayRef<VPValue *> Operands,
183-
VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
184-
DebugLoc DL = DebugLoc::getUnknown(),
185-
const Twine &Name = "") {
181+
VPInstruction *createOverflowingOp(
182+
unsigned Opcode, ArrayRef<VPValue *> Operands,
183+
VPRecipeWithIRFlags::WrapFlagsTy WrapFlags = {false, false},
184+
DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") {
186185
return tryInsertInstruction(
187186
new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name));
188187
}

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7639,6 +7639,10 @@ createWidenInductionRecipes(VPInstruction *PhiR,
76397639
assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start &&
76407640
"Start VPValue must match IndDesc's start value");
76417641

7642+
// It is always safe to copy over the NoWrap and FastMath flags. In
7643+
// particular, when folding tail by masking, the masked-off lanes are never
7644+
// used, so it is safe.
7645+
VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc);
76427646
VPValue *Step =
76437647
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
76447648

@@ -7651,7 +7655,7 @@ createWidenInductionRecipes(VPInstruction *PhiR,
76517655

76527656
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr());
76537657
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7654-
IndDesc, PhiR->getDebugLoc());
7658+
IndDesc, Flags, PhiR->getDebugLoc());
76557659
}
76567660

76577661
VPHeaderPHIRecipe *
@@ -7705,10 +7709,15 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
77057709
PHINode *Phi = WidenIV->getPHINode();
77067710
VPValue *Start = WidenIV->getStartValue();
77077711
const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
7712+
7713+
// It is always safe to copy over the NoWrap and FastMath flags. In
7714+
// particular, when folding tail by masking, the masked-off lanes are never
7715+
// used, so it is safe.
7716+
VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc);
77087717
VPValue *Step =
77097718
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
7710-
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7711-
IndDesc, I, VPI->getDebugLoc());
7719+
return new VPWidenIntOrFpInductionRecipe(
7720+
Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
77127721
}
77137722

77147723
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2151,7 +2151,8 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
21512151
/// A recipe for handling phi nodes of integer and floating-point inductions,
21522152
/// producing their vector values. This is an abstract recipe and must be
21532153
/// converted to concrete recipes before executing.
2154-
class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
2154+
class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe,
2155+
public VPIRFlags {
21552156
TruncInst *Trunc;
21562157

21572158
// If this recipe is unrolled it will have 2 additional operands.
@@ -2160,19 +2161,20 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
21602161
public:
21612162
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
21622163
VPValue *VF, const InductionDescriptor &IndDesc,
2163-
DebugLoc DL)
2164+
const VPIRFlags &Flags, DebugLoc DL)
21642165
: VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
21652166
Step, IndDesc, DL),
2166-
Trunc(nullptr) {
2167+
VPIRFlags(Flags), Trunc(nullptr) {
21672168
addOperand(VF);
21682169
}
21692170

21702171
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
21712172
VPValue *VF, const InductionDescriptor &IndDesc,
2172-
TruncInst *Trunc, DebugLoc DL)
2173+
TruncInst *Trunc, const VPIRFlags &Flags,
2174+
DebugLoc DL)
21732175
: VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
21742176
Step, IndDesc, DL),
2175-
Trunc(Trunc) {
2177+
VPIRFlags(Flags), Trunc(Trunc) {
21762178
addOperand(VF);
21772179
SmallVector<std::pair<unsigned, MDNode *>> Metadata;
21782180
(void)Metadata;
@@ -2186,7 +2188,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
21862188
VPWidenIntOrFpInductionRecipe *clone() override {
21872189
return new VPWidenIntOrFpInductionRecipe(
21882190
getPHINode(), getStartValue(), getStepValue(), getVFValue(),
2189-
getInductionDescriptor(), Trunc, getDebugLoc());
2191+
getInductionDescriptor(), Trunc, *this, getDebugLoc());
21902192
}
21912193

21922194
VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2387,7 +2387,9 @@ void VPWidenIntOrFpInductionRecipe::printRecipe(
23872387
raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
23882388
O << Indent;
23892389
printAsOperand(O, SlotTracker);
2390-
O << " = WIDEN-INDUCTION ";
2390+
O << " = WIDEN-INDUCTION";
2391+
printFlags(O);
2392+
O << " ";
23912393
printOperands(O, SlotTracker);
23922394

23932395
if (auto *TI = getTruncInst())

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
7676
VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
7777
VPValue *Step =
7878
vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
79+
// It is always safe to copy over the NoWrap and FastMath flags. In
80+
// particular, when folding tail by masking, the masked-off lanes are
81+
// never used, so it is safe.
82+
VPIRFlags Flags = vputils::getFlagsFromIndDesc(*II);
7983
NewRecipe = new VPWidenIntOrFpInductionRecipe(
80-
Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
84+
Phi, Start, Step, &Plan.getVF(), *II, Flags,
85+
Ingredient.getDebugLoc());
8186
}
8287
} else {
8388
assert(isa<VPInstruction>(&Ingredient) &&
@@ -542,6 +547,11 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
542547
// only.
543548
if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
544549
vputils::onlyFirstLaneUsed(WidenNewIV)) {
550+
// We are replacing a wide canonical iv with a suitable wide induction.
551+
// This is used to compute header mask, hence all lanes will be used and
552+
// we need to drop wrap flags only applying to lanes guaranteed to execute
553+
// in the original scalar loop.
554+
WidenOriginalIV->dropPoisonGeneratingFlags();
545555
WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
546556
WidenNewIV->eraseFromParent();
547557
return;
@@ -3285,16 +3295,13 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
32853295
const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
32863296
Instruction::BinaryOps AddOp;
32873297
Instruction::BinaryOps MulOp;
3288-
// FIXME: The newly created binary instructions should contain nsw/nuw
3289-
// flags, which can be found from the original scalar operations.
3290-
VPIRFlags Flags;
3298+
VPIRFlags Flags = *WidenIVR;
32913299
if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
32923300
AddOp = Instruction::Add;
32933301
MulOp = Instruction::Mul;
32943302
} else {
32953303
AddOp = ID.getInductionOpcode();
32963304
MulOp = Instruction::FMul;
3297-
Flags = ID.getInductionBinOp()->getFastMathFlags();
32983305
}
32993306

33003307
// If the phi is truncated, truncate the start and step values.
@@ -3406,7 +3413,7 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
34063413
Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
34073414
Type *StepTy = TypeInfo.inferScalarType(Step);
34083415
VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3409-
Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3416+
Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
34103417
VPValue *PtrAdd = Builder.createNaryOp(
34113418
VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
34123419
R->replaceAllUsesWith(PtrAdd);
@@ -3416,7 +3423,7 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
34163423
Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
34173424
VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
34183425
DL);
3419-
VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3426+
VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
34203427

34213428
VPValue *InductionGEP =
34223429
Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,19 @@ std::optional<VPValue *>
7373
getRecipesForUncountableExit(VPlan &Plan,
7474
SmallVectorImpl<VPRecipeBase *> &Recipes,
7575
SmallVectorImpl<VPRecipeBase *> &GEPs);
76+
77+
/// Extracts and returns NoWrap and FastMath flags from the induction binop in
78+
/// \p ID.
79+
inline VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID) {
80+
if (ID.getKind() == InductionDescriptor::IK_FpInduction)
81+
return ID.getInductionBinOp()->getFastMathFlags();
82+
83+
if (auto *OBO = dyn_cast_if_present<OverflowingBinaryOperator>(
84+
ID.getInductionBinOp()))
85+
return VPIRFlags::WrapFlagsTy(OBO->hasNoUnsignedWrap(),
86+
OBO->hasNoSignedWrap());
87+
return {};
88+
}
7689
} // namespace vputils
7790

7891
//===----------------------------------------------------------------------===//

llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
1414
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
1515
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
1616
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
17-
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
18-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
17+
; CHECK-NEXT: [[TMP3:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
18+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP3]]
1919
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP1]], i64 0
2020
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
2121
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -76,8 +76,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
7676
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
7777
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
7878
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
79-
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
80-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
79+
; CHECK-NEXT: [[TMP3:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
80+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP3]]
8181
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP1]], i64 0
8282
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
8383
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1052,7 +1052,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
10521052
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
10531053
; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4
10541054
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
1055-
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
1055+
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
10561056
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
10571057
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
10581058
; DEFAULT: [[MIDDLE_BLOCK]]:

0 commit comments

Comments
 (0)