Skip to content

Commit b80eee3

Browse files
committed
[VPlan] Explicitly replicate VPInstructions by VF.
Extend replicateByVF added in #142433 (aa24029) to also explicitly unroll replicating VPInstructions. Now the only remaining case where we replicate for all lanes is VPReplicateRecipes in replicate regions.
1 parent b369237 commit b80eee3

File tree

9 files changed

+110
-117
lines changed

9 files changed

+110
-117
lines changed

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
333333
LastLane = 0;
334334
}
335335

336+
assert(IsSingleScalar && "must be a single-scalar at this point");
337+
// We need to construct the vector value for a single-scalar value by
338+
// broadcasting the scalar to all lanes.
336339
auto *LastInst = cast<Instruction>(get(Def, LastLane));
337340
// Set the insert point after the last scalarized instruction or after the
338341
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
@@ -343,27 +346,8 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
343346
: std::next(BasicBlock::iterator(LastInst));
344347
Builder.SetInsertPoint(&*NewIP);
345348

346-
// However, if we are vectorizing, we need to construct the vector values.
347-
// If the value is known to be uniform after vectorization, we can just
348-
// broadcast the scalar value corresponding to lane zero. Otherwise, we
349-
// construct the vector values using insertelement instructions. Since the
350-
// resulting vectors are stored in State, we will only generate the
351-
// insertelements once.
352-
Value *VectorValue = nullptr;
353-
if (IsSingleScalar) {
354-
VectorValue = GetBroadcastInstrs(ScalarValue);
355-
set(Def, VectorValue);
356-
} else {
357-
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
358-
assert(isa<VPInstruction>(Def) &&
359-
"Explicit BuildVector recipes must have"
360-
"handled packing for non-VPInstructions.");
361-
// Initialize packing with insertelements to start from poison.
362-
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
363-
for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
364-
VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane);
365-
set(Def, VectorValue);
366-
}
349+
Value *VectorValue = GetBroadcastInstrs(ScalarValue);
350+
set(Def, VectorValue);
367351
Builder.restoreIP(OldIP);
368352
return VectorValue;
369353
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -901,6 +901,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
901901
return R && classof(R);
902902
}
903903

904+
virtual VPRecipeWithIRFlags *clone() override = 0;
905+
904906
void execute(VPTransformState &State) override = 0;
905907

906908
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
@@ -1045,13 +1047,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10451047
VScale,
10461048
};
10471049

1048-
private:
1049-
typedef unsigned char OpcodeTy;
1050-
OpcodeTy Opcode;
1051-
1052-
/// An optional name that can be used for the generated IR instruction.
1053-
const std::string Name;
1054-
10551050
/// Returns true if this VPInstruction generates scalar values for all lanes.
10561051
/// Most VPInstructions generate a single value per part, either vector or
10571052
/// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
@@ -1060,6 +1055,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10601055
/// underlying ingredient.
10611056
bool doesGeneratePerAllLanes() const;
10621057

1058+
private:
1059+
typedef unsigned char OpcodeTy;
1060+
OpcodeTy Opcode;
1061+
1062+
/// An optional name that can be used for the generated IR instruction.
1063+
const std::string Name;
1064+
10631065
/// Returns true if we can generate a scalar for the first lane only if
10641066
/// needed.
10651067
bool canGenerateScalarForFirstLane() const;
@@ -1069,11 +1071,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10691071
/// existing value is returned rather than a generated one.
10701072
Value *generate(VPTransformState &State);
10711073

1072-
/// Utility methods serving execute(): generates a scalar single instance of
1073-
/// the modeled instruction for a given lane. \returns the scalar generated
1074-
/// value for lane \p Lane.
1075-
Value *generatePerLane(VPTransformState &State, const VPLane &Lane);
1076-
10771074
#if !defined(NDEBUG)
10781075
/// Return the number of operands determined by the opcode of the
10791076
/// VPInstruction. Returns -1u if the number of operands cannot be determined

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -525,16 +525,6 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
525525
}
526526
}
527527

528-
Value *VPInstruction::generatePerLane(VPTransformState &State,
529-
const VPLane &Lane) {
530-
IRBuilderBase &Builder = State.Builder;
531-
532-
assert(getOpcode() == VPInstruction::PtrAdd &&
533-
"only PtrAdd opcodes are supported for now");
534-
return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
535-
State.get(getOperand(1), Lane), Name);
536-
}
537-
538528
/// Create a conditional branch using \p Cond branching to the successors of \p
539529
/// VPBB. Note that the first successor is always forward (i.e. not created yet)
540530
/// while the second successor may already have been created (if it is a header
@@ -1154,24 +1144,13 @@ void VPInstruction::execute(VPTransformState &State) {
11541144
"Set flags not supported for the provided opcode");
11551145
if (hasFastMathFlags())
11561146
State.Builder.setFastMathFlags(getFastMathFlags());
1157-
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1158-
(vputils::onlyFirstLaneUsed(this) ||
1159-
isVectorToScalar() || isSingleScalar());
1160-
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
1161-
if (GeneratesPerAllLanes) {
1162-
for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
1163-
Lane != NumLanes; ++Lane) {
1164-
Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
1165-
assert(GeneratedValue && "generatePerLane must produce a value");
1166-
State.set(this, GeneratedValue, VPLane(Lane));
1167-
}
1168-
return;
1169-
}
1170-
11711147
Value *GeneratedValue = generate(State);
11721148
if (!hasResult())
11731149
return;
11741150
assert(GeneratedValue && "generate must produce a value");
1151+
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1152+
(vputils::onlyFirstLaneUsed(this) ||
1153+
isVectorToScalar() || isSingleScalar());
11751154
assert((((GeneratedValue->getType()->isVectorTy() ||
11761155
GeneratedValue->getType()->isStructTy()) ==
11771156
!GeneratesPerFirstLaneOnly) ||
@@ -1244,6 +1223,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
12441223
case VPInstruction::Broadcast:
12451224
case VPInstruction::ReductionStartVector:
12461225
return true;
1226+
case VPInstruction::BuildStructVector:
1227+
case VPInstruction::BuildVector:
1228+
return getNumOperands() > 1;
12471229
case VPInstruction::PtrAdd:
12481230
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
12491231
case VPInstruction::WidePtrAdd:

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3377,34 +3377,40 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
33773377
vp_depth_first_shallow(Plan.getEntry()));
33783378
auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
33793379
vp_depth_first_shallow(LoopRegion->getEntry()));
3380-
// Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
3381-
// excluding ones in replicate regions. Those are not materialized explicitly
3382-
// yet. Those vector users are still handled in VPReplicateRegion::execute(),
3383-
// via shouldPack().
3380+
// Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3381+
// VPInstructions, excluding ones in replicate regions. Those are not
3382+
// materialized explicitly yet. Those vector users are still handled in
3383+
// VPReplicateRegion::execute(), via shouldPack().
33843384
// TODO: materialize build vectors for replicating recipes in replicating
33853385
// regions.
33863386
// TODO: materialize build vectors for VPInstructions.
33873387
for (VPBasicBlock *VPBB :
33883388
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
33893389
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3390-
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
3391-
auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
3390+
auto *DefR = dyn_cast<VPRecipeWithIRFlags>(&R);
3391+
if (!DefR || !isa<VPReplicateRecipe, VPInstruction>(DefR))
3392+
continue;
3393+
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
33923394
VPRegionBlock *ParentRegion =
33933395
cast<VPRecipeBase>(U)->getParent()->getParent();
3394-
return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
3396+
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
33953397
};
3396-
if (!RepR || RepR->isSingleScalar() ||
3397-
none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
3398+
if ((isa<VPReplicateRecipe>(DefR) &&
3399+
cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3400+
(isa<VPInstruction>(DefR) &&
3401+
!cast<VPInstruction>(DefR)->doesGeneratePerAllLanes()) ||
3402+
vputils::onlyFirstLaneUsed(DefR) ||
3403+
none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
33983404
continue;
33993405

3400-
Type *ScalarTy = TypeInfo.inferScalarType(RepR);
3406+
Type *ScalarTy = TypeInfo.inferScalarType(DefR);
34013407
unsigned Opcode = ScalarTy->isStructTy()
34023408
? VPInstruction::BuildStructVector
34033409
: VPInstruction::BuildVector;
3404-
auto *BuildVector = new VPInstruction(Opcode, {RepR});
3405-
BuildVector->insertAfter(RepR);
3410+
auto *BuildVector = new VPInstruction(Opcode, {DefR});
3411+
BuildVector->insertAfter(DefR);
34063412

3407-
RepR->replaceUsesWithIf(
3413+
DefR->replaceUsesWithIf(
34083414
BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
34093415
VPUser &U, unsigned) {
34103416
return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ struct VPlanTransforms {
158158
/// Explicitly unroll \p Plan by \p UF.
159159
static void unrollByUF(VPlan &Plan, unsigned UF);
160160

161-
/// Replace each VPReplicateRecipe outside on any replicate region in \p Plan
162-
/// with \p VF single-scalar recipes.
161+
/// Replace each VPReplicateRecipe and replicating VPInstruction outside of
162+
/// any replicate region in \p Plan with \p VF single-scalar recipes.
163163
/// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby
164164
/// dissolving the latter.
165165
static void replicateByVF(VPlan &Plan, ElementCount VF);

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -463,15 +463,15 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
463463
VPlanTransforms::removeDeadRecipes(Plan);
464464
}
465465

466-
/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p
467-
/// Def2LaneDefs to look up scalar definitions for operands of \RepR.
468-
static VPReplicateRecipe *
466+
/// Create a single-scalar clone of \p DefR for lane \p Lane. Use \p
467+
/// Def2LaneDefs to look up scalar definitions for operands of \p DefR.
468+
static VPRecipeWithIRFlags *
469469
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
470-
VPReplicateRecipe *RepR, VPLane Lane,
470+
VPRecipeWithIRFlags *DefR, VPLane Lane,
471471
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
472472
// Collect the operands at Lane, creating extracts as needed.
473473
SmallVector<VPValue *> NewOps;
474-
for (VPValue *Op : RepR->operands()) {
474+
for (VPValue *Op : DefR->operands()) {
475475
// If Op is a definition that has been unrolled, directly use the clone for
476476
// the corresponding lane.
477477
auto LaneDefs = Def2LaneDefs.find(Op);
@@ -501,11 +501,19 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
501501
NewOps.push_back(Ext);
502502
}
503503

504-
auto *New =
505-
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
506-
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
507-
New->transferFlags(*RepR);
508-
New->insertBefore(RepR);
504+
VPRecipeWithIRFlags *New;
505+
if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
506+
New =
507+
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
508+
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
509+
} else {
510+
New = DefR->clone();
511+
for (const auto &[Idx, Op] : enumerate(NewOps)) {
512+
New->setOperand(Idx, Op);
513+
}
514+
}
515+
New->transferFlags(*DefR);
516+
New->insertBefore(DefR);
509517
return New;
510518
}
511519

@@ -530,41 +538,46 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
530538
SmallVector<VPRecipeBase *> ToRemove;
531539
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
532540
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
533-
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
534-
if (!RepR || RepR->isSingleScalar())
541+
auto *DefR = dyn_cast<VPRecipeWithIRFlags>(&R);
542+
if (!DefR || !isa<VPInstruction, VPReplicateRecipe>(DefR))
543+
continue;
544+
if ((isa<VPReplicateRecipe>(DefR) &&
545+
cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
546+
(isa<VPInstruction>(DefR) &&
547+
!cast<VPInstruction>(DefR)->doesGeneratePerAllLanes()))
535548
continue;
536549

537-
VPBuilder Builder(RepR);
538-
if (RepR->getNumUsers() == 0) {
539-
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
540-
vputils::isSingleScalar(RepR->getOperand(1))) {
550+
VPBuilder Builder(DefR);
551+
if (DefR->getNumUsers() == 0) {
552+
if (isa<StoreInst>(DefR->getUnderlyingInstr()) &&
553+
vputils::isSingleScalar(DefR->getOperand(1))) {
541554
// Stores to invariant addresses need to store the last lane only.
542-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
555+
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane::getLastLaneForVF(VF),
543556
Def2LaneDefs);
544557
} else {
545-
// Create single-scalar version of RepR for all lanes.
558+
// Create single-scalar version of DefR for all lanes.
546559
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
547-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
560+
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
548561
}
549-
RepR->eraseFromParent();
562+
DefR->eraseFromParent();
550563
continue;
551564
}
552-
/// Create single-scalar version of RepR for all lanes.
565+
/// Create single-scalar version of DefR for all lanes.
553566
SmallVector<VPValue *> LaneDefs;
554567
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
555568
LaneDefs.push_back(
556-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
569+
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));
557570

558-
Def2LaneDefs[RepR] = LaneDefs;
571+
Def2LaneDefs[DefR] = LaneDefs;
559572
/// Users that only demand the first lane can use the definition for lane
560573
/// 0.
561-
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
562-
return U.onlyFirstLaneUsed(RepR);
574+
DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
575+
return U.onlyFirstLaneUsed(DefR);
563576
});
564577

565-
// Update each build vector user that currently has RepR as its only
578+
// Update each build vector user that currently has DefR as its only
566579
// operand, to have all LaneDefs as its operands.
567-
for (VPUser *U : to_vector(RepR->users())) {
580+
for (VPUser *U : to_vector(DefR->users())) {
568581
auto *VPI = dyn_cast<VPInstruction>(U);
569582
if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
570583
VPI->getOpcode() != VPInstruction::BuildStructVector))
@@ -576,7 +589,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
576589
for (VPValue *LaneDef : drop_begin(LaneDefs))
577590
VPI->addOperand(LaneDef);
578591
}
579-
ToRemove.push_back(RepR);
592+
ToRemove.push_back(DefR);
580593
}
581594
}
582595
for (auto *R : reverse(ToRemove))

llvm/test/Transforms/LoopVectorize/pointer-induction.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ define void @a(ptr readnone %b) {
3333
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
3434
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP14]]
3535
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr null, i64 [[TMP17]]
36+
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
37+
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x ptr> [[TMP21]], ptr [[NEXT_GEP2]], i32 1
38+
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x ptr> [[TMP22]], ptr [[NEXT_GEP3]], i32 2
39+
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x ptr> [[TMP23]], ptr [[NEXT_GEP4]], i32 3
3640
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 -1
3741
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
3842
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
@@ -649,9 +653,6 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) {
649653
; STRIDED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], [[TMP8]]
650654
; STRIDED-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP1]]
651655
; STRIDED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], [[TMP10]]
652-
; STRIDED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr null, i64 [[TMP5]]
653-
; STRIDED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr null, i64 [[TMP7]]
654-
; STRIDED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP9]]
655656
; STRIDED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
656657
; STRIDED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]]
657658
; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8

0 commit comments

Comments
 (0)