11 changes: 11 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -322,6 +322,17 @@ class VPBuilder {
return createScalarCast(CastOp, Op, ResultTy, DL);
}

VPValue *createScalarSExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy,
DebugLoc DL) {
if (ResultTy == SrcTy)
return Op;
Instruction::CastOps CastOp =
ResultTy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits()
? Instruction::Trunc
: Instruction::SExt;
return createScalarCast(CastOp, Op, ResultTy, DL);
}

VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
Type *ResultTy) {
VPIRFlags Flags;
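For illustration only (not part of the patch): a minimal sketch of how a transform might use the new helper, assuming the usual VPlan headers are available; the wrapper name and parameters below are hypothetical.

// Hypothetical usage sketch: bring a computed start index to the induction
// step's scalar type before attaching it to a recipe. Only
// createScalarSExtOrTrunc comes from this patch; everything else here is
// illustrative.
static VPValue *castStartIndexToStepType(VPBuilder &Builder, VPValue *StartIdx,
                                         Type *StepTy, Type *StartIdxTy,
                                         DebugLoc DL) {
  // Returns StartIdx unchanged when the types already match; otherwise the
  // helper emits a scalar Trunc or SExt based on the relative bit widths.
  return Builder.createScalarSExtOrTrunc(StartIdx, StepTy, StartIdxTy, DL);
}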
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7316,6 +7316,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
VPlanTransforms::runPass(VPlanTransforms::unrollReplicateRegions, BestVPlan,
BestVF);
VPlanTransforms::runPass(VPlanTransforms::mergeBlocksIntoPredecessors,
BestVPlan);
bool HasBranchWeights =
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
if (HasBranchWeights) {
16 changes: 8 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3783,9 +3783,9 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
};

/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their scalar values.
class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<3> {
/// producing their scalar values. Before unrolling, the recipe has 3 operands:
/// IV, step, and VF. Unrolling adds an extra StartIndex operand.
class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
Instruction::BinaryOps InductionOpcode;

public:
@@ -3809,16 +3809,16 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
~VPScalarIVStepsRecipe() override = default;

VPScalarIVStepsRecipe *clone() override {
return new VPScalarIVStepsRecipe(
auto *NewR = new VPScalarIVStepsRecipe(
getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
getDebugLoc());
// Add start index operand, if present.
for (VPValue *Op : drop_begin(operands(), 3))
NewR->addOperand(Op);
return NewR;
}

/// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that
/// this is only accurate after the VPlan has been unrolled.
bool isPart0() const { return getUnrollPart(*this) == 0; }

VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)

/// Generate the scalarized versions of the phi node as needed by their users.
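As an aside, a hypothetical helper (not in the patch) makes the operand layout described above explicit: {IV, Step, VF} before unrolling, plus StartIndex afterwards.

// Hypothetical sketch: read the optional start-index operand added by
// unrolling; recipes that have not been unrolled only carry {IV, Step, VF}.
static VPValue *getStartIndexOrNull(VPScalarIVStepsRecipe *Steps) {
  return Steps->getNumOperands() == 3 ? nullptr : Steps->getOperand(3);
}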
32 changes: 13 additions & 19 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2380,8 +2380,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
// iteration.
bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
// Compute the scalar steps and save the results in State.
Type *IntStepTy =
IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());

unsigned StartLane = 0;
unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
@@ -2390,32 +2388,28 @@
EndLane = StartLane + 1;
}
Value *StartIdx0;
if (getUnrollPart(*this) == 0)
StartIdx0 = ConstantInt::get(IntStepTy, 0);
else {
StartIdx0 = State.get(getOperand(2), true);
if (getUnrollPart(*this) != 1) {
StartIdx0 =
Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
getUnrollPart(*this)));
}
StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
}

if (BaseIVTy->isFloatingPointTy())
StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
if (getNumOperands() == 3)
StartIdx0 = getSignedIntOrFpConstant(BaseIVTy, 0);
else
StartIdx0 = State.get(getOperand(3), true);

for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
Value *StartIdx = Builder.CreateBinOp(
AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
Value *StartIdx = StartIdx0;
if (Lane != 0) {
StartIdx = Builder.CreateBinOp(AddOp, StartIdx0,
getSignedIntOrFpConstant(BaseIVTy, Lane));
}
// The step returned by `createStepForVF` is a runtime-evaluated value
// when VF is scalable. Otherwise, it should be folded into a Constant.
assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable");
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
State.set(this, Add, VPLane(Lane));
if (State.Lane)
State.set(this, Add, VPLane(Lane));
else
State.set(this, Add, VPLane(0));
}
}

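To make the per-lane arithmetic above concrete, here is a standalone model in plain C++ (not LLVM code) of the values the recipe produces for an integer induction, under the assumption that each lane evaluates to BaseIV + (StartIdx0 + Lane) * Step, with StartIdx0 defaulting to 0 when no start-index operand is present.

#include <cassert>
#include <cstdint>
#include <vector>

// Standalone model of VPScalarIVStepsRecipe::execute for an integer IV:
// lane value = BaseIV + (StartIdx0 + Lane) * Step.
std::vector<int64_t> scalarSteps(int64_t BaseIV, int64_t Step,
                                 int64_t StartIdx0, unsigned VF) {
  std::vector<int64_t> Lanes(VF);
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Lanes[Lane] = BaseIV + (StartIdx0 + Lane) * Step;
  return Lanes;
}

int main() {
  // VF = 4, IV starting at 10 with step 2 and no start index: {10, 12, 14, 16}.
  assert((scalarSteps(10, 2, 0, 4) == std::vector<int64_t>{10, 12, 14, 16}));
  // With a start index of 4 (e.g. a later copy), the lanes continue the IV.
  assert((scalarSteps(10, 2, 4, 4) == std::vector<int64_t>{18, 20, 22, 24}));
  return 0;
}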
12 changes: 7 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -497,7 +497,7 @@ static void addReplicateRegions(VPlan &Plan) {

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
SmallVector<VPBasicBlock *> WorkList;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
@@ -1457,9 +1457,11 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
}

// VPScalarIVSteps for part 0 can be replaced by their start value, if only
// the first lane is demanded.
// the first lane is demanded and the start index operand, if present, is 0.
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
if ((Steps->getNumOperands() == 3 ||
match(Steps->getOperand(3), m_ZeroInt())) &&
vputils::onlyFirstLaneUsed(Steps)) {
Steps->replaceAllUsesWith(Steps->getOperand(0));
return;
}
@@ -4434,9 +4436,9 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
for (VPBasicBlock *VPBB :
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (!isa<VPReplicateRecipe, VPInstruction>(&R))
if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(&R))
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto *DefR = cast<VPSingleDefRecipe>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
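A hypothetical predicate (not in the patch) captures the condition checked in simplifyRecipe above; it assumes the same VPlanPatternMatch helpers this file already uses.

// Hypothetical sketch: true if the recipe either has no start-index operand
// or its start index is the constant 0 (the case the simplification handles).
static bool hasZeroStartIndex(VPScalarIVStepsRecipe *Steps) {
  return Steps->getNumOperands() == 3 ||
         match(Steps->getOperand(3), m_ZeroInt());
}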
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -178,6 +178,10 @@ struct VPlanTransforms {
/// replicate regions, thereby dissolving the latter.
static void replicateByVF(VPlan &Plan, ElementCount VF);

/// Replace replicate regions by explicitly replicating the regions' contents
/// \p VF times, each copy processing a single lane.
static void unrollReplicateRegions(VPlan &Plan, ElementCount VF);

/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -189,6 +193,8 @@ struct VPlanTransforms {
/// block merging.
LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan);

/// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
/// region block and remove the mask operand. Optimize the created regions by
/// iteratively sinking scalar operands into the region, followed by merging
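For context, these entry points are driven through VPlanTransforms::runPass; the snippet below mirrors the executePlan change earlier in this diff (BestVPlan and BestVF are the names used there).

// Mirrors the call sites added in LoopVectorizationPlanner::executePlan:
VPlanTransforms::runPass(VPlanTransforms::unrollReplicateRegions, BestVPlan,
                         BestVF);
VPlanTransforms::runPass(VPlanTransforms::mergeBlocksIntoPredecessors,
                         BestVPlan);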