Commit b8eaceb
[VPlan] Explicitly replicate VPInstructions by VF. (#155102)
Extend replicateByVF added in #142433 (aa24029) to also explicitly unroll replicating VPInstructions. Now the only remaining case where we replicate for all lanes is VPReplicateRecipes in replicate regions.

PR: #155102
1 parent ed1f1b8 commit b8eaceb

9 files changed: +119 -119 lines changed

llvm/lib/Transforms/Vectorize/VPlan.cpp
Lines changed: 7 additions & 23 deletions

@@ -343,37 +343,21 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     LastLane = 0;
   }

-  auto *LastInst = cast<Instruction>(get(Def, LastLane));
+  // We need to construct the vector value for a single-scalar value by
+  // broadcasting the scalar to all lanes.
+  // TODO: Replace by introducing Broadcast VPInstructions.
+  assert(IsSingleScalar && "must be a single-scalar at this point");
   // Set the insert point after the last scalarized instruction or after the
   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
   // will directly follow the scalar definitions.
   auto OldIP = Builder.saveIP();
+  auto *LastInst = cast<Instruction>(get(Def, LastLane));
   auto NewIP = isa<PHINode>(LastInst)
                    ? LastInst->getParent()->getFirstNonPHIIt()
                    : std::next(BasicBlock::iterator(LastInst));
   Builder.SetInsertPoint(&*NewIP);
-
-  // However, if we are vectorizing, we need to construct the vector values.
-  // If the value is known to be uniform after vectorization, we can just
-  // broadcast the scalar value corresponding to lane zero. Otherwise, we
-  // construct the vector values using insertelement instructions. Since the
-  // resulting vectors are stored in State, we will only generate the
-  // insertelements once.
-  Value *VectorValue = nullptr;
-  if (IsSingleScalar) {
-    VectorValue = GetBroadcastInstrs(ScalarValue);
-    set(Def, VectorValue);
-  } else {
-    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
-    assert(isa<VPInstruction>(Def) &&
-           "Explicit BuildVector recipes must have"
-           "handled packing for non-VPInstructions.");
-    // Initialize packing with insertelements to start from poison.
-    VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
-    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
-      VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane);
-    set(Def, VectorValue);
-  }
+  Value *VectorValue = GetBroadcastInstrs(ScalarValue);
+  set(Def, VectorValue);
   Builder.restoreIP(OldIP);
   return VectorValue;
 }
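
The remaining path above only needs a broadcast; GetBroadcastInstrs itself is defined elsewhere and not shown in this diff. For orientation, a broadcast of one scalar to all lanes can be written against the public IRBuilder API roughly as below; the helper name and signature are ours, not part of the patch.

// Not part of the patch: minimal sketch of broadcasting a single scalar
// definition to every lane of a VF-wide vector.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static Value *broadcastScalar(IRBuilderBase &Builder, Value *ScalarValue,
                              ElementCount VF) {
  // CreateVectorSplat emits an insertelement into lane 0 followed by a
  // shufflevector with an all-zero mask, i.e. the scalar repeated in every
  // lane.
  return Builder.CreateVectorSplat(VF, ScalarValue, "broadcast");
}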

llvm/lib/Transforms/Vectorize/VPlan.h
Lines changed: 9 additions & 12 deletions

@@ -908,6 +908,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
     return R && classof(R);
   }

+  virtual VPRecipeWithIRFlags *clone() override = 0;
+
   static inline bool classof(const VPSingleDefRecipe *U) {
     auto *R = dyn_cast<VPRecipeBase>(U);
     return R && classof(R);
@@ -1061,13 +1063,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     VScale,
   };

-private:
-  typedef unsigned char OpcodeTy;
-  OpcodeTy Opcode;
-
-  /// An optional name that can be used for the generated IR instruction.
-  const std::string Name;
-
   /// Returns true if this VPInstruction generates scalar values for all lanes.
   /// Most VPInstructions generate a single value per part, either vector or
   /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
@@ -1076,6 +1071,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
   /// underlying ingredient.
   bool doesGeneratePerAllLanes() const;

+private:
+  typedef unsigned char OpcodeTy;
+  OpcodeTy Opcode;
+
+  /// An optional name that can be used for the generated IR instruction.
+  const std::string Name;
+
   /// Returns true if we can generate a scalar for the first lane only if
   /// needed.
   bool canGenerateScalarForFirstLane() const;
@@ -1085,11 +1087,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
   /// existing value is returned rather than a generated one.
   Value *generate(VPTransformState &State);

-  /// Utility methods serving execute(): generates a scalar single instance of
-  /// the modeled instruction for a given lane. \returns the scalar generated
-  /// value for lane \p Lane.
-  Value *generatePerLane(VPTransformState &State, const VPLane &Lane);
-
 #if !defined(NDEBUG)
   /// Return the number of operands determined by the opcode of the
   /// VPInstruction. Returns -1u if the number of operands cannot be determined
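
The new pure virtual clone() on VPRecipeWithIRFlags only tightens the return type; concrete recipes already override clone(). A standalone illustration of the covariant-clone pattern this relies on (plain C++, not LLVM code): it is what lets cloneForLane in VPlanUnroll.cpp call DefR->clone() and get a VPRecipeWithIRFlags * back without a cast.

// Standalone sketch of a covariant, still-abstract clone() override.
struct Base {
  virtual ~Base() = default;
  virtual Base *clone() = 0;
};

struct Mid : Base {
  // Re-declared as pure virtual, but with a narrower return type.
  Mid *clone() override = 0;
};

struct Leaf final : Mid {
  Leaf *clone() override { return new Leaf(*this); }
};

// Callers holding a Mid * get a Mid * back; no cast needed.
static Mid *cloneViaMid(Mid &M) { return M.clone(); }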

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Lines changed: 9 additions & 24 deletions

@@ -564,16 +564,6 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   }
 }

-Value *VPInstruction::generatePerLane(VPTransformState &State,
-                                      const VPLane &Lane) {
-  IRBuilderBase &Builder = State.Builder;
-
-  assert(getOpcode() == VPInstruction::PtrAdd &&
-         "only PtrAdd opcodes are supported for now");
-  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
-                              State.get(getOperand(1), Lane), Name);
-}
-
 /// Create a conditional branch using \p Cond branching to the successors of \p
 /// VPBB. Note that the first successor is always forward (i.e. not created yet)
 /// while the second successor may already have been created (if it is a header
@@ -1197,24 +1187,13 @@ void VPInstruction::execute(VPTransformState &State) {
          "Set flags not supported for the provided opcode");
   if (hasFastMathFlags())
     State.Builder.setFastMathFlags(getFastMathFlags());
-  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
-                                   (vputils::onlyFirstLaneUsed(this) ||
-                                    isVectorToScalar() || isSingleScalar());
-  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
-  if (GeneratesPerAllLanes) {
-    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
-         Lane != NumLanes; ++Lane) {
-      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
-      assert(GeneratedValue && "generatePerLane must produce a value");
-      State.set(this, GeneratedValue, VPLane(Lane));
-    }
-    return;
-  }
-
   Value *GeneratedValue = generate(State);
   if (!hasResult())
     return;
   assert(GeneratedValue && "generate must produce a value");
+  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
+                                   (vputils::onlyFirstLaneUsed(this) ||
+                                    isVectorToScalar() || isSingleScalar());
   assert((((GeneratedValue->getType()->isVectorTy() ||
             GeneratedValue->getType()->isStructTy()) ==
            !GeneratesPerFirstLaneOnly) ||
@@ -1287,6 +1266,12 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case VPInstruction::Broadcast:
   case VPInstruction::ReductionStartVector:
     return true;
+  case VPInstruction::BuildStructVector:
+  case VPInstruction::BuildVector:
+    // Before replicating by VF, Build(Struct)Vector uses all lanes of the
+    // operand, after replicating its operands only the first lane is used.
+    // Before replicating, it will have only a single operand.
+    return getNumOperands() > 1;
   case VPInstruction::PtrAdd:
     return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
   case VPInstruction::WidePtrAdd:
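
With generatePerLane gone, execute() no longer scalarizes a PtrAdd lane by lane; the same scalar ptradds are now produced by the VF single-scalar clones that replicateByVF creates. For orientation only, the per-lane expansion that used to happen here amounts to the following (our helper, not part of the patch; it assumes one already-scalar pointer and offset per lane):

// Not from the patch: per-lane scalar ptradds, as the removed
// generatePerLane used to emit for VPInstruction::PtrAdd.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include <cassert>

using namespace llvm;

static void emitPerLanePtrAdds(IRBuilderBase &Builder, ArrayRef<Value *> Ptrs,
                               ArrayRef<Value *> Offsets,
                               SmallVectorImpl<Value *> &Results) {
  assert(Ptrs.size() == Offsets.size() && "one pointer and one offset per lane");
  for (auto [Ptr, Off] : zip(Ptrs, Offsets))
    Results.push_back(Builder.CreatePtrAdd(Ptr, Off));
}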

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Lines changed: 19 additions & 14 deletions

@@ -3695,34 +3695,39 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
       vp_depth_first_shallow(Plan.getEntry()));
   auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
       vp_depth_first_shallow(LoopRegion->getEntry()));
-  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
-  // excluding ones in replicate regions. Those are not materialized explicitly
-  // yet. Those vector users are still handled in VPReplicateRegion::execute(),
-  // via shouldPack().
+  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
+  // VPInstructions, excluding ones in replicate regions. Those are not
+  // materialized explicitly yet. Those vector users are still handled in
+  // VPReplicateRegion::execute(), via shouldPack().
   // TODO: materialize build vectors for replicating recipes in replicating
   // regions.
-  // TODO: materialize build vectors for VPInstructions.
   for (VPBasicBlock *VPBB :
        concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
-      auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
+      if (!isa<VPReplicateRecipe, VPInstruction>(&R))
+        continue;
+      auto *DefR = cast<VPRecipeWithIRFlags>(&R);
+      auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
         VPRegionBlock *ParentRegion =
             cast<VPRecipeBase>(U)->getParent()->getParent();
-        return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
+        return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
       };
-      if (!RepR || RepR->isSingleScalar() ||
-          none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
+      if ((isa<VPReplicateRecipe>(DefR) &&
+           cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
+          (isa<VPInstruction>(DefR) &&
+           (vputils::onlyFirstLaneUsed(DefR) ||
+            !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
+          none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
        continue;

-      Type *ScalarTy = TypeInfo.inferScalarType(RepR);
+      Type *ScalarTy = TypeInfo.inferScalarType(DefR);
       unsigned Opcode = ScalarTy->isStructTy()
                             ? VPInstruction::BuildStructVector
                             : VPInstruction::BuildVector;
-      auto *BuildVector = new VPInstruction(Opcode, {RepR});
-      BuildVector->insertAfter(RepR);
+      auto *BuildVector = new VPInstruction(Opcode, {DefR});
+      BuildVector->insertAfter(DefR);

-      RepR->replaceUsesWithIf(
+      DefR->replaceUsesWithIf(
          BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
                           VPUser &U, unsigned) {
            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
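
Once replicateByVF has rewritten a def into one scalar per lane, the Build(Struct)Vector materialized here is what packs those scalars back into a vector; for the non-struct case that packing lowers to the insertelement chain visible in the updated pointer-induction.ll checks further below. A rough sketch of that lowering (our helper, not the recipe's actual implementation; a fixed VF is assumed, matching the non-scalable assertion removed from VPlan.cpp above):

// Not from the patch: packing one scalar value per lane into a vector,
// starting from poison, one insertelement per lane.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include <cassert>

using namespace llvm;

static Value *packLanesIntoVector(IRBuilderBase &Builder,
                                  ArrayRef<Value *> LaneValues) {
  assert(!LaneValues.empty() && "need at least one lane");
  auto *VecTy =
      FixedVectorType::get(LaneValues.front()->getType(), LaneValues.size());
  Value *Vec = PoisonValue::get(VecTy);
  for (unsigned Lane = 0, NumLanes = LaneValues.size(); Lane != NumLanes; ++Lane)
    Vec = Builder.CreateInsertElement(Vec, LaneValues[Lane],
                                      Builder.getInt32(Lane));
  return Vec;
}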

llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Lines changed: 4 additions & 4 deletions

@@ -158,10 +158,10 @@ struct VPlanTransforms {
   /// Explicitly unroll \p Plan by \p UF.
   static void unrollByUF(VPlan &Plan, unsigned UF);

-  /// Replace each VPReplicateRecipe outside on any replicate region in \p Plan
-  /// with \p VF single-scalar recipes.
-  /// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby
-  /// dissolving the latter.
+  /// Replace each replicating VPReplicateRecipe and VPInstruction outside of
+  /// any replicate region in \p Plan with \p VF single-scalar recipes.
+  /// TODO: Also replicate VPScalarIVSteps and VPReplicateRecipes inside
+  /// replicate regions, thereby dissolving the latter.
   static void replicateByVF(VPlan &Plan, ElementCount VF);

   /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
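
A purely hypothetical call-site sketch (the real driver is not part of this diff) to show how the two transforms relate: unrollByUF copies recipes per part, and replicateByVF then scatters per-lane recipes into VF single-scalar copies. The wrapper name and the assumption that replication runs right after unrolling are ours; only the two static member declarations come from the header above.

// Hypothetical usage only; inside lib/Transforms/Vectorize the header is
// included by name.
#include "VPlanTransforms.h"

using namespace llvm;

static void lowerToConcreteVFAndUF(VPlan &Plan, ElementCount VF, unsigned UF) {
  VPlanTransforms::unrollByUF(Plan, UF);
  VPlanTransforms::replicateByVF(Plan, VF);
}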

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
Lines changed: 43 additions & 25 deletions

@@ -463,15 +463,16 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
   VPlanTransforms::removeDeadRecipes(Plan);
 }

-/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p
-/// Def2LaneDefs to look up scalar definitions for operands of \RepR.
-static VPReplicateRecipe *
+/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
+/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
+/// definitions for operands of \DefR.
+static VPRecipeWithIRFlags *
 cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
-             VPReplicateRecipe *RepR, VPLane Lane,
+             VPRecipeWithIRFlags *DefR, VPLane Lane,
              const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
   // Collect the operands at Lane, creating extracts as needed.
   SmallVector<VPValue *> NewOps;
-  for (VPValue *Op : RepR->operands()) {
+  for (VPValue *Op : DefR->operands()) {
     // If Op is a definition that has been unrolled, directly use the clone for
     // the corresponding lane.
     auto LaneDefs = Def2LaneDefs.find(Op);
@@ -501,11 +502,24 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
     NewOps.push_back(Ext);
   }

-  auto *New =
-      new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
-                            /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
-  New->transferFlags(*RepR);
-  New->insertBefore(RepR);
+  VPRecipeWithIRFlags *New;
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
+    // TODO: have cloning of replicate recipes also provide the desired result
+    // coupled with setting its operands to NewOps (deriving IsSingleScalar and
+    // Mask from the operands?)
+    New =
+        new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
+                              /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
+  } else {
+    assert(isa<VPInstruction>(DefR) &&
+           "DefR must be a VPReplicateRecipe or VPInstruction");
+    New = DefR->clone();
+    for (const auto &[Idx, Op] : enumerate(NewOps)) {
+      New->setOperand(Idx, Op);
+    }
+  }
+  New->transferFlags(*DefR);
+  New->insertBefore(DefR);
   return New;
 }

@@ -530,34 +544,38 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
   SmallVector<VPRecipeBase *> ToRemove;
   for (VPBasicBlock *VPBB : VPBBsToUnroll) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
-      if (!RepR || RepR->isSingleScalar())
+      if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
+          (isa<VPReplicateRecipe>(&R) &&
+           cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
+          (isa<VPInstruction>(&R) &&
+           !cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
        continue;

-      VPBuilder Builder(RepR);
-      if (RepR->getNumUsers() == 0) {
-        // Create single-scalar version of RepR for all lanes.
+      auto *DefR = cast<VPRecipeWithIRFlags>(&R);
+      VPBuilder Builder(DefR);
+      if (DefR->getNumUsers() == 0) {
+        // Create single-scalar version of DefR for all lanes.
         for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-          cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
-        RepR->eraseFromParent();
+          cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
+        DefR->eraseFromParent();
         continue;
       }
-      /// Create single-scalar version of RepR for all lanes.
+      /// Create single-scalar version of DefR for all lanes.
       SmallVector<VPValue *> LaneDefs;
       for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
         LaneDefs.push_back(
-            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
+            cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));

-      Def2LaneDefs[RepR] = LaneDefs;
+      Def2LaneDefs[DefR] = LaneDefs;
       /// Users that only demand the first lane can use the definition for lane
       /// 0.
-      RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
-        return U.onlyFirstLaneUsed(RepR);
+      DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
+        return U.onlyFirstLaneUsed(DefR);
       });

-      // Update each build vector user that currently has RepR as its only
+      // Update each build vector user that currently has DefR as its only
       // operand, to have all LaneDefs as its operands.
-      for (VPUser *U : to_vector(RepR->users())) {
+      for (VPUser *U : to_vector(DefR->users())) {
        auto *VPI = dyn_cast<VPInstruction>(U);
        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
                     VPI->getOpcode() != VPInstruction::BuildStructVector))
@@ -569,7 +587,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
       for (VPValue *LaneDef : drop_begin(LaneDefs))
         VPI->addOperand(LaneDef);
       }
-      ToRemove.push_back(RepR);
+      ToRemove.push_back(DefR);
     }
   }
   for (auto *R : reverse(ToRemove))
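
The "collect the operands at Lane" step at the top of cloneForLane is where the per-lane rewiring happens: operands whose defs were already replicated are looked up in Def2LaneDefs, and remaining vector operands get a per-lane extract. Translated to plain IR terms it amounts to the following (our helper and map type, not the VPlan implementation):

// Not from the patch: selecting the value an operand contributes to one lane.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static Value *
operandAtLane(IRBuilderBase &Builder, Value *Op, unsigned Lane,
              const DenseMap<Value *, SmallVector<Value *>> &Def2LaneDefs) {
  auto It = Def2LaneDefs.find(Op);
  if (It != Def2LaneDefs.end())
    return It->second[Lane]; // single-scalar clone created for this lane
  if (!Op->getType()->isVectorTy())
    return Op; // uniform scalar, usable by every lane as-is
  return Builder.CreateExtractElement(Op, Builder.getInt32(Lane));
}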

llvm/test/Transforms/LoopVectorize/pointer-induction.ll
Lines changed: 4 additions & 3 deletions

@@ -33,6 +33,10 @@ define void @a(ptr readnone %b) {
 ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
 ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP14]]
 ; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr null, i64 [[TMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x ptr> [[TMP21]], ptr [[NEXT_GEP2]], i32 1
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x ptr> [[TMP22]], ptr [[NEXT_GEP3]], i32 2
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x ptr> [[TMP23]], ptr [[NEXT_GEP4]], i32 3
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 -1
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
@@ -649,9 +653,6 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) {
 ; STRIDED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], [[TMP8]]
 ; STRIDED-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP1]]
 ; STRIDED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], [[TMP10]]
-; STRIDED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr null, i64 [[TMP5]]
-; STRIDED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr null, i64 [[TMP7]]
-; STRIDED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP9]]
 ; STRIDED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
 ; STRIDED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
