30 changes: 7 additions & 23 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -343,37 +343,21 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
LastLane = 0;
Collaborator:
Independent: above comment deserves updating, as this method no longer has a Part parameter - "// If Values have been set for this Def return the one relevant for \p Part."

}

auto *LastInst = cast<Instruction>(get(Def, LastLane));
// We need to construct the vector value for a single-scalar value by
// broadcasting the scalar to all lanes.
Comment on lines +346 to +347
Collaborator:
Suggested change:
- // We need to construct the vector value for a single-scalar value by
- // broadcasting the scalar to all lanes.
+ // We need to construct the vector value for a single-scalar value by
+ // broadcasting the scalar to all lanes.
+ // TODO: Replace by introducing Broadcast VPInstructions.
?

Contributor Author:
Done, thanks.

// TODO: Replace by introducing Broadcast VPInstructions.
assert(IsSingleScalar && "must be a single-scalar at this point");
// Set the insert point after the last scalarized instruction or after the
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
// will directly follow the scalar definitions.
auto OldIP = Builder.saveIP();
auto *LastInst = cast<Instruction>(get(Def, LastLane));
auto NewIP = isa<PHINode>(LastInst)
? LastInst->getParent()->getFirstNonPHIIt()
: std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);

// However, if we are vectorizing, we need to construct the vector values.
// If the value is known to be uniform after vectorization, we can just
// broadcast the scalar value corresponding to lane zero. Otherwise, we
// construct the vector values using insertelement instructions. Since the
// resulting vectors are stored in State, we will only generate the
// insertelements once.
Value *VectorValue = nullptr;
if (IsSingleScalar) {
VectorValue = GetBroadcastInstrs(ScalarValue);
set(Def, VectorValue);
} else {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
assert(isa<VPInstruction>(Def) &&
"Explicit BuildVector recipes must have"
"handled packing for non-VPInstructions.");
// Initialize packing with insertelements to start from poison.
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane);
set(Def, VectorValue);
}
Value *VectorValue = GetBroadcastInstrs(ScalarValue);
set(Def, VectorValue);
Builder.restoreIP(OldIP);
return VectorValue;
}
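For orientation outside the diff: the surviving single-scalar path amounts to splatting the lane-0 value across the vector. Below is a minimal sketch of that broadcast using IRBuilder; makeBroadcast is a hypothetical stand-in for the GetBroadcastInstrs helper used above, not an LLVM API.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch (hypothetical helper): broadcast one scalar definition to every
// vector lane, mirroring the IsSingleScalar path in VPTransformState::get.
static Value *makeBroadcast(IRBuilderBase &Builder, Value *ScalarValue,
                            ElementCount VF) {
  // For fixed-width vectors this lowers to an insertelement at lane 0 plus
  // a zero-mask shufflevector, i.e. a splat.
  return Builder.CreateVectorSplat(VF, ScalarValue, "broadcast");
}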
21 changes: 9 additions & 12 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -908,6 +908,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
return R && classof(R);
}

virtual VPRecipeWithIRFlags *clone() override = 0;

static inline bool classof(const VPSingleDefRecipe *U) {
auto *R = dyn_cast<VPRecipeBase>(U);
return R && classof(R);
@@ -1061,13 +1063,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
VScale,
};

private:
typedef unsigned char OpcodeTy;
OpcodeTy Opcode;

/// An optional name that can be used for the generated IR instruction.
const std::string Name;

/// Returns true if this VPInstruction generates scalar values for all lanes.
/// Most VPInstructions generate a single value per part, either vector or
/// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
@@ -1076,6 +1071,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// underlying ingredient.
bool doesGeneratePerAllLanes() const;
Collaborator:
doesGeneratePerAllLanes() moved from private to public.

Contributor Author:
yep


private:
typedef unsigned char OpcodeTy;
OpcodeTy Opcode;

/// An optional name that can be used for the generated IR instruction.
const std::string Name;

/// Returns true if we can generate a scalar for the first lane only if
/// needed.
bool canGenerateScalarForFirstLane() const;
@@ -1085,11 +1087,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// existing value is returned rather than a generated one.
Value *generate(VPTransformState &State);

/// Utility methods serving execute(): generates a scalar single instance of
/// the modeled instruction for a given lane. \returns the scalar generated
/// value for lane \p Lane.
Value *generatePerLane(VPTransformState &State, const VPLane &Lane);

#if !defined(NDEBUG)
/// Return the number of operands determined by the opcode of the
/// VPInstruction. Returns -1u if the number of operands cannot be determined
33 changes: 9 additions & 24 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -564,16 +564,6 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
}
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
const VPLane &Lane) {
IRBuilderBase &Builder = State.Builder;

assert(getOpcode() == VPInstruction::PtrAdd &&
"only PtrAdd opcodes are supported for now");
return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
State.get(getOperand(1), Lane), Name);
}

/// Create a conditional branch using \p Cond branching to the successors of \p
/// VPBB. Note that the first successor is always forward (i.e. not created yet)
/// while the second successor may already have been created (if it is a header
@@ -1197,24 +1187,13 @@ void VPInstruction::execute(VPTransformState &State) {
"Set flags not supported for the provided opcode");
if (hasFastMathFlags())
State.Builder.setFastMathFlags(getFastMathFlags());
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) ||
isVectorToScalar() || isSingleScalar());
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
if (GeneratesPerAllLanes) {
for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
Lane != NumLanes; ++Lane) {
Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
assert(GeneratedValue && "generatePerLane must produce a value");
State.set(this, GeneratedValue, VPLane(Lane));
}
return;
}
Comment on lines -1203 to -1212
Collaborator:
This is dropped due to earlier replication of VPInstructions by VF.

Contributor Author:
yep


Value *GeneratedValue = generate(State);
if (!hasResult())
return;
assert(GeneratedValue && "generate must produce a value");
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) ||
isVectorToScalar() || isSingleScalar());
assert((((GeneratedValue->getType()->isVectorTy() ||
GeneratedValue->getType()->isStructTy()) ==
!GeneratesPerFirstLaneOnly) ||
@@ -1287,6 +1266,12 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::Broadcast:
case VPInstruction::ReductionStartVector:
return true;
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
// Before replicating by VF, Build(Struct)Vector uses all lanes of the
// operand, after replicating its operands only the first lane is used.
// Before replicating, it will only have a single operand.
Collaborator:
Suggested change:
- // Before replicating, it will only have a single operand.
+ // Before replicating, it will have only a single operand.

Contributor Author:
Done, thanks.

return getNumOperands() > 1;
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
case VPInstruction::WidePtrAdd:
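Aside: the Build(Struct)Vector recipes discussed in this hunk pack per-lane scalar definitions into a vector, which is where the new insertelement sequences in the pointer-induction.ll diff below come from. A hedged sketch of that packing (packLanes is illustrative, not the recipe's actual codegen entry point):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch (hypothetical helper): pack one scalar value per lane into a
// fixed-width vector, starting from poison - the IR shape BuildVector emits.
static Value *packLanes(IRBuilderBase &Builder, ArrayRef<Value *> LaneValues) {
  auto *VecTy =
      FixedVectorType::get(LaneValues.front()->getType(), LaneValues.size());
  Value *Vec = PoisonValue::get(VecTy);
  for (unsigned Lane = 0; Lane != LaneValues.size(); ++Lane)
    Vec = Builder.CreateInsertElement(Vec, LaneValues[Lane], Lane, "pack");
  return Vec;
}

During codegen the patch looks through these build-vectors for users inside replicate regions, as the author notes in the test-file thread below.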
33 changes: 19 additions & 14 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3695,34 +3695,39 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
vp_depth_first_shallow(Plan.getEntry()));
auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(LoopRegion->getEntry()));
// Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
// excluding ones in replicate regions. Those are not materialized explicitly
// yet. Those vector users are still handled in VPReplicateRegion::execute(),
// via shouldPack().
// Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
// VPInstructions, excluding ones in replicate regions. Those are not
// materialized explicitly yet. Those vector users are still handled in
// VPReplicateRegion::execute(), via shouldPack().
// TODO: materialize build vectors for replicating recipes in replicating
// regions.
// TODO: materialize build vectors for VPInstructions.
for (VPBasicBlock *VPBB :
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
if (!isa<VPReplicateRecipe, VPInstruction>(&R))
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
VPRegionBlock *ParentRegion =
cast<VPRecipeBase>(U)->getParent()->getParent();
return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
};
if (!RepR || RepR->isSingleScalar() ||
none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
if ((isa<VPReplicateRecipe>(DefR) &&
cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
(isa<VPInstruction>(DefR) &&
(vputils::onlyFirstLaneUsed(DefR) ||
!cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
continue;

Type *ScalarTy = TypeInfo.inferScalarType(RepR);
Type *ScalarTy = TypeInfo.inferScalarType(DefR);
unsigned Opcode = ScalarTy->isStructTy()
? VPInstruction::BuildStructVector
: VPInstruction::BuildVector;
auto *BuildVector = new VPInstruction(Opcode, {RepR});
BuildVector->insertAfter(RepR);
auto *BuildVector = new VPInstruction(Opcode, {DefR});
BuildVector->insertAfter(DefR);

RepR->replaceUsesWithIf(
DefR->replaceUsesWithIf(
BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
VPUser &U, unsigned) {
return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
8 changes: 4 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -158,10 +158,10 @@ struct VPlanTransforms {
/// Explicitly unroll \p Plan by \p UF.
static void unrollByUF(VPlan &Plan, unsigned UF);

/// Replace each VPReplicateRecipe outside on any replicate region in \p Plan
/// with \p VF single-scalar recipes.
/// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby
/// dissolving the latter.
/// Replace each replicating VPReplicateRecipe and VPInstruction outside of
/// any replicate region in \p Plan with \p VF single-scalar recipes.
/// TODO: Also replicate VPScalarIVSteps and VPReplicateRecipes inside
/// replicate regions, thereby dissolving the latter.
static void replicateByVF(VPlan &Plan, ElementCount VF);

/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
68 changes: 43 additions & 25 deletions llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -463,15 +463,16 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
VPlanTransforms::removeDeadRecipes(Plan);
}

/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p
/// Def2LaneDefs to look up scalar definitions for operands of \RepR.
static VPReplicateRecipe *
/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
/// definitions for operands of \DefR.
static VPRecipeWithIRFlags *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
VPReplicateRecipe *RepR, VPLane Lane,
VPRecipeWithIRFlags *DefR, VPLane Lane,
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
// Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps;
for (VPValue *Op : RepR->operands()) {
for (VPValue *Op : DefR->operands()) {
// If Op is a definition that has been unrolled, directly use the clone for
// the corresponding lane.
auto LaneDefs = Def2LaneDefs.find(Op);
@@ -501,11 +502,24 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
NewOps.push_back(Ext);
}

auto *New =
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
New->transferFlags(*RepR);
New->insertBefore(RepR);
VPRecipeWithIRFlags *New;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
// TODO: have cloning of replicate recipes also provide the desired result
// coupled with setting its operands to NewOps (deriving IsSingleScalar and
// Mask from the operands?)
New =
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
Comment on lines +511 to +512
Collaborator:
TODO: have cloning of replicate recipes also provide the desired result coupled with setting its operands to NewOps (deriving IsSingleScalar and Mask from the operands?)

Contributor Author:
added, thanks

} else {
assert(isa<VPInstruction>(DefR) &&
"DefR must be a VPReplicateRecipe or VPInstruction");
New = DefR->clone();
for (const auto &[Idx, Op] : enumerate(NewOps)) {
New->setOperand(Idx, Op);
}
}
New->transferFlags(*DefR);
New->insertBefore(DefR);
return New;
}
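
For readers skimming: the operand handling in cloneForLane reduces to reusing an already-scalarized lane definition when Def2LaneDefs has one, and otherwise extracting the lane from the vector value. A simplified sketch of the fallback (extractLane is illustrative; the patch builds the extract as a VPInstruction rather than plain IR):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch (hypothetical helper): fetch the per-lane value of an operand.
// Scalar operands are uniform across lanes and used as-is; vector
// definitions get an explicit extractelement for the requested lane.
static Value *extractLane(IRBuilderBase &Builder, Value *Op, unsigned Lane) {
  if (!Op->getType()->isVectorTy())
    return Op; // Single scalar: identical for every lane.
  return Builder.CreateExtractElement(Op, uint64_t(Lane), "lane.ext");
}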

@@ -530,34 +544,38 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
if (!RepR || RepR->isSingleScalar())
if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
(isa<VPReplicateRecipe>(&R) &&
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
(isa<VPInstruction>(&R) &&
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
continue;

VPBuilder Builder(RepR);
if (RepR->getNumUsers() == 0) {
// Create single-scalar version of RepR for all lanes.
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
VPBuilder Builder(DefR);
if (DefR->getNumUsers() == 0) {
// Create single-scalar version of DefR for all lanes.
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
RepR->eraseFromParent();
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
DefR->eraseFromParent();
continue;
}
/// Create single-scalar version of RepR for all lanes.
/// Create single-scalar version of DefR for all lanes.
SmallVector<VPValue *> LaneDefs;
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
LaneDefs.push_back(
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));

Def2LaneDefs[RepR] = LaneDefs;
Def2LaneDefs[DefR] = LaneDefs;
/// Users that only demand the first lane can use the definition for lane
/// 0.
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
return U.onlyFirstLaneUsed(RepR);
DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
return U.onlyFirstLaneUsed(DefR);
});

// Update each build vector user that currently has RepR as its only
// Update each build vector user that currently has DefR as its only
// operand, to have all LaneDefs as its operands.
for (VPUser *U : to_vector(RepR->users())) {
for (VPUser *U : to_vector(DefR->users())) {
auto *VPI = dyn_cast<VPInstruction>(U);
if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
VPI->getOpcode() != VPInstruction::BuildStructVector))
@@ -569,7 +587,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
for (VPValue *LaneDef : drop_begin(LaneDefs))
VPI->addOperand(LaneDef);
}
ToRemove.push_back(RepR);
ToRemove.push_back(DefR);
}
}
for (auto *R : reverse(ToRemove))
7 changes: 4 additions & 3 deletions llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -33,6 +33,10 @@ define void @a(ptr readnone %b) {
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP14]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr null, i64 [[TMP17]]
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x ptr> [[TMP21]], ptr [[NEXT_GEP2]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x ptr> [[TMP22]], ptr [[NEXT_GEP3]], i32 2
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x ptr> [[TMP23]], ptr [[NEXT_GEP4]], i32 3
Collaborator:
How are these inserts (and those below) related to the patch - perhaps to dropping the packing from VPTransformState::get()?

Contributor Author:
If there are users in replicate regions, we will materialize explicit build-vectors for those users, but we will look through them during codegen.

Collaborator:
Potential for being cleaned up when replicate regions will be replicated by VF?

Contributor Author:
Yes, exactly.

; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 -1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
@@ -649,9 +653,6 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) {
; STRIDED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], [[TMP8]]
; STRIDED-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP1]]
; STRIDED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], [[TMP10]]
; STRIDED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr null, i64 [[TMP5]]
; STRIDED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr null, i64 [[TMP7]]
; STRIDED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP9]]
Comment on lines -652 to -654
Collaborator:
DCE or generation avoided now?

Contributor Author:
These will be DCE'd after unrolling, since only the last lane is used.

; STRIDED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
; STRIDED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]]
; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8