Skip to content

Commit 6a75337

Browse files
committed
[VPlan] Directly unroll VectorPointerRecipe
1 parent e3de8ff commit 6a75337

File tree

56 files changed

+903
-1126
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

56 files changed

+903
-1126
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7607,7 +7607,10 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
76077607
Ptr, &Plan.getVF(), getLoadStoreType(I),
76087608
/*Stride*/ -1, Flags, VPI->getDebugLoc());
76097609
} else {
7610-
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7610+
const DataLayout &DL = I->getDataLayout();
7611+
VPValue *Offset = Plan.getConstantInt(
7612+
DL.getIndexType(Ptr->getUnderlyingValue()->getType()), 0);
7613+
VectorPtr = new VPVectorPointerRecipe(Ptr, Offset, getLoadStoreType(I),
76117614
GEP ? GEP->getNoWrapFlags()
76127615
: GEPNoWrapFlags::none(),
76137616
VPI->getDebugLoc());

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1964,20 +1964,22 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
19641964
#endif
19651965
};
19661966

1967-
/// A recipe to compute the pointers for widened memory accesses of IndexTy.
1968-
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
1969-
public VPUnrollPartAccessor<1> {
1967+
/// A recipe to compute the pointers for widened memory accesses of
1968+
/// SourceElementTy.
1969+
class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
19701970
Type *SourceElementTy;
19711971

19721972
public:
1973-
VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy,
1973+
VPVectorPointerRecipe(VPValue *Ptr, VPValue *Offset, Type *SourceElementTy,
19741974
GEPNoWrapFlags GEPFlags, DebugLoc DL)
1975-
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
1976-
GEPFlags, DL),
1975+
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, {Ptr, Offset}, GEPFlags,
1976+
DL),
19771977
SourceElementTy(SourceElementTy) {}
19781978

19791979
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
19801980

1981+
VPValue *getOffset() { return getOperand(1); }
1982+
19811983
void execute(VPTransformState &State) override;
19821984

19831985
Type *getSourceElementType() const { return SourceElementTy; }
@@ -1997,14 +1999,11 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
19971999
}
19982000

19992001
VPVectorPointerRecipe *clone() override {
2000-
return new VPVectorPointerRecipe(getOperand(0), SourceElementTy,
2001-
getGEPNoWrapFlags(), getDebugLoc());
2002+
return new VPVectorPointerRecipe(getOperand(0), getOffset(),
2003+
SourceElementTy, getGEPNoWrapFlags(),
2004+
getDebugLoc());
20022005
}
20032006

2004-
/// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that
2005-
/// this is only accurate after the VPlan has been unrolled.
2006-
bool isFirstPart() const { return getUnrollPart(*this) == 0; }
2007-
20082007
/// Return the cost of this VPHeaderPHIRecipe.
20092008
InstructionCost computeCost(ElementCount VF,
20102009
VPCostContext &Ctx) const override {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2632,15 +2632,13 @@ void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
26322632

26332633
void VPVectorPointerRecipe::execute(VPTransformState &State) {
26342634
auto &Builder = State.Builder;
2635-
unsigned CurrentPart = getUnrollPart(*this);
2636-
const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2637-
Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this));
26382635
Value *Ptr = State.get(getOperand(0), VPLane(0));
2639-
2640-
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2641-
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
2642-
"", getGEPNoWrapFlags());
2643-
2636+
Value *Step = State.get(getOffset(), true);
2637+
if (auto *C = dyn_cast<ConstantInt>(Step))
2638+
if (C->isZero())
2639+
return State.set(this, Ptr, /*IsScalar=*/true);
2640+
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Step, "",
2641+
getGEPNoWrapFlags());
26442642
State.set(this, ResultPtr, /*IsScalar*/ true);
26452643
}
26462644

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1381,14 +1381,6 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
13811381
}
13821382
}
13831383

1384-
// VPVectorPointer for part 0 can be replaced by their start pointer.
1385-
if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
1386-
if (VecPtr->isFirstPart()) {
1387-
VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1388-
return;
1389-
}
1390-
}
1391-
13921384
// VPScalarIVSteps for part 0 can be replaced by their start value, if only
13931385
// the first lane is demanded.
13941386
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -298,6 +298,22 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
298298
Copy->setOperand(1, getValueForPart(Op, Part));
299299
continue;
300300
}
301+
if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
302+
VPBuilder Builder(VPR);
303+
auto *Prev = cast<VPVectorPointerRecipe>(getValueForPart(VPR, Part - 1))
304+
->getOperand(1);
305+
VPValue *Increment = &Plan.getVF();
306+
Type *IncTy = TypeInfo.inferScalarType(Increment);
307+
Increment = Builder.createScalarZExtOrTrunc(
308+
Increment, TypeInfo.inferScalarType(Prev), IncTy,
309+
DebugLoc::getCompilerGenerated());
310+
VPIRFlags Flags = VPIRFlags::WrapFlagsTy(true, true);
311+
VPInstruction *Add = Builder.createNaryOp(
312+
Instruction::Add, {Prev, Increment}, Flags, VPR->getDebugLoc());
313+
Copy->setOperand(0, VPR->getOperand(0));
314+
Copy->setOperand(1, Add);
315+
continue;
316+
}
301317
if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
302318
auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
303319
if (Phi && Phi->isOrdered()) {
@@ -315,12 +331,12 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
315331
// Add operand indicating the part to generate code for, to recipes still
316332
// requiring it.
317333
if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
318-
VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
334+
VPVectorEndPointerRecipe>(Copy) ||
319335
match(Copy,
320336
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
321337
Copy->addOperand(getConstantInt(Part));
322338

323-
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
339+
if (isa<VPVectorEndPointerRecipe>(R))
324340
Copy->setOperand(0, R.getOperand(0));
325341
}
326342
}

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,8 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
2121
; CHECK-NEXT: br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
2222
; CHECK: [[VECTOR_PH]]:
2323
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
24-
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
24+
; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP8]], 2
25+
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP11]], 2
2526
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP9]]
2627
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
2728
; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
@@ -36,9 +37,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
3637
; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP26]]
3738
; CHECK-NEXT: [[TMP32:%.*]] = sext i32 [[TMP30]] to i64
3839
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
39-
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
40-
; CHECK-NEXT: [[TMP38:%.*]] = shl nuw i64 [[TMP37]], 1
41-
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP34]], i64 [[TMP38]]
40+
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP34]], i64 [[TMP11]]
4241
; CHECK-NEXT: store <vscale x 2 x double> zeroinitializer, ptr [[TMP34]], align 8
4342
; CHECK-NEXT: store <vscale x 2 x double> zeroinitializer, ptr [[TMP39]], align 8
4443
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
3030
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
3131
; DEFAULT: [[VECTOR_PH]]:
3232
; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
33-
; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
33+
; DEFAULT-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP9]], 8
34+
; DEFAULT-NEXT: [[TMP10:%.*]] = mul i64 [[TMP13]], 2
3435
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
3536
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
3637
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
@@ -40,9 +41,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
4041
; DEFAULT: [[VECTOR_BODY]]:
4142
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
4243
; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
43-
; DEFAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
44-
; DEFAULT-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 3
45-
; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP14]]
44+
; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP13]]
4645
; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
4746
; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
4847
; DEFAULT-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
@@ -56,9 +55,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
5655
; DEFAULT-NEXT: [[TMP24:%.*]] = trunc <vscale x 8 x i16> [[TMP22]] to <vscale x 8 x i8>
5756
; DEFAULT-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP23]] to <vscale x 8 x i8>
5857
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
59-
; DEFAULT-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
60-
; DEFAULT-NEXT: [[TMP28:%.*]] = shl nuw i64 [[TMP27]], 3
61-
; DEFAULT-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP26]], i64 [[TMP28]]
58+
; DEFAULT-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP26]], i64 [[TMP13]]
6259
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP24]], ptr [[TMP26]], align 1
6360
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP25]], ptr [[TMP29]], align 1
6461
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]

llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -144,7 +144,8 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
144144
; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
145145
; INTERLEAVE-4-VLA: vector.ph:
146146
; INTERLEAVE-4-VLA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
147-
; INTERLEAVE-4-VLA-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
147+
; INTERLEAVE-4-VLA-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP2]], 4
148+
; INTERLEAVE-4-VLA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP5]], 4
148149
; INTERLEAVE-4-VLA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
149150
; INTERLEAVE-4-VLA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
150151
; INTERLEAVE-4-VLA-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -155,14 +156,10 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
155156
; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
156157
; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
157158
; INTERLEAVE-4-VLA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
158-
; INTERLEAVE-4-VLA-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
159-
; INTERLEAVE-4-VLA-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
160-
; INTERLEAVE-4-VLA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP6]]
161-
; INTERLEAVE-4-VLA-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
162-
; INTERLEAVE-4-VLA-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
159+
; INTERLEAVE-4-VLA-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], [[TMP5]]
160+
; INTERLEAVE-4-VLA-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP9]], [[TMP5]]
161+
; INTERLEAVE-4-VLA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP5]]
163162
; INTERLEAVE-4-VLA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP9]]
164-
; INTERLEAVE-4-VLA-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
165-
; INTERLEAVE-4-VLA-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 12
166163
; INTERLEAVE-4-VLA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP12]]
167164
; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 1
168165
; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 1

0 commit comments

Comments (0)