16 changes: 10 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3925,7 +3925,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;

VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4182,7 +4183,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {

// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
*CM.PSE.getSE());
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6871,7 +6873,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
@@ -7082,7 +7084,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
*CM.PSE.getSE());
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -8704,7 +8707,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -10043,7 +10047,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
CM.CostKind);
CM.CostKind, *CM.PSE.getSE());
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
11 changes: 8 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1778,8 +1778,10 @@ VPCostContext::getOperandInfo(VPValue *V) const {
return TTI::getOperandInfo(V->getLiveInIRValue());
}

InstructionCost VPCostContext::getScalarizationOverhead(
Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
InstructionCost
VPCostContext::getScalarizationOverhead(Type *ResultTy,
ArrayRef<const VPValue *> Operands,
ElementCount VF, bool Skip) {

Contributor: Should the variable be more descriptive, e.g. something like SkipNonReplicatingLoadOpCost? The reason I say this is because even when Skip is true we don't actually skip if it's a VPReplicateRecipe with opcode Load.

Contributor Author: Ah yes, I renamed it in the header but not here. Updated to AlwaysIncludeReplicatingR, thanks.

if (VF.isScalar())
return 0;

@@ -1799,7 +1801,10 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
if (Op->isLiveIn() ||
(!Skip && isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
(isa<VPReplicateRecipe>(Op) &&
cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
16 changes: 10 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
ScalarEvolution &SE;

VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
TargetTransformInfo::TargetCostKind CostKind)
TargetTransformInfo::TargetCostKind CostKind,
ScalarEvolution &SE)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
CostKind(CostKind) {}
CostKind(CostKind), SE(SE) {}

/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@

/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
/// type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Type *ResultTy,
ArrayRef<const VPValue *> Operands,
ElementCount VF);
/// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
/// is true, always compute the cost of scalarizing replicating operands.
InstructionCost
getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
ElementCount VF,
bool AlwaysIncludeReplicatingR = false);
};

/// This class can be used to assign names to VPValues. For VPValues without
116 changes: 103 additions & 13 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3075,6 +3075,63 @@ bool VPReplicateRecipe::shouldPack() const {
});
}

/// Returns true if \p Ptr is a pointer computation for which the legacy cost
/// model computes a SCEV expression when comping the address cost.

Contributor: nit: Should this be "when computing the address cost."?

Contributor Author: Yep, fixed, thanks.

static bool shouldUseAddressAccessSCEV(VPValue *Ptr) {
auto *PtrR = Ptr->getDefiningRecipe();
if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
Instruction::GetElementPtr) ||
isa<VPWidenGEPRecipe>(PtrR)))
return false;

// We are looking for a gep with all loop invariant indices except for one
// which should be an induction variable.

Contributor: nit: Does the comment need updating here, as it doesn't seem to match the code? What the code really seems to do is "make sure all operands are either loop invariants or induction variables", i.e. there can be more than one induction variable.

Contributor Author: Yep, updated, thanks! (The original comment was just copied.)

unsigned NumOperands = PtrR->getNumOperands();
for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
VPValue *Opd = PtrR->getOperand(Idx);
if (!(Opd->isDefinedOutsideLoopRegions()) &&

Contributor: nit: I think you can drop the brackets around Opd->isDefinedOutsideLoopRegions().

Contributor Author: Yep, removed, thanks!

!isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
return false;
}

return true;
}

/// Returns true of \p V is used as part of the address of another load or

Contributor: nit: Return true if \p ...?

Contributor Author: Updated, thanks.

/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
SmallPtrSet<const VPUser *, 4> Seen;
SmallVector<const VPUser *> WorkList = {V};

while (!WorkList.empty()) {
auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
if (!Cur || !Seen.insert(Cur).second)
continue;

for (VPUser *U : Cur->users()) {
if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(U))
if (InterleaveR->getAddr() == Cur)
return true;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
if (RepR->getOpcode() == Instruction::Load &&
RepR->getOperand(0) == Cur)
return true;
if (RepR->getOpcode() == Instruction::Store &&
RepR->getOperand(1) == Cur)
return true;
}
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {

Contributor: Do we need to worry about histogram recipes too?

Also, do we have to worry about the EVL recipes, such as VPInterleaveEVLRecipe?

Contributor Author: I updated the code to check for VPInterleaveBase; VPWidenMemoryRecipe should include both the EVL and non-EVL variants.

I don't think we need to handle histogram recipes here, as the address must be an AddRec in the current loop, so it should not be able to depend on a load in the loop.

if (MemR->getAddr() == Cur && MemR->isConsecutive())
return true;
}
}

append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
}
return false;
}
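
The thread above mentions that the check was updated to use the common interleave base class rather than only VPInterleaveRecipe. A minimal sketch of how that might read, assuming VPInterleaveBase is the shared base of the EVL and non-EVL interleave recipes and that it exposes getAddr() (an assumption; only the class name comes from the author's reply, and this is not verified against the merged patch):

      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
        if (InterleaveR->getAddr() == Cur)
          return true;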

InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3182,21 +3239,54 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
if (isSingleScalar()) {
bool IsLoad = UI->getOpcode() == Instruction::Load;
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
const Align Alignment = getLoadStoreAlignment(UI);
unsigned AS = getLoadStoreAddressSpace(UI);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
}
if (VF.isScalable() && !isSingleScalar())
return InstructionCost::getInvalid();

// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
break;
if (getParent()->getParent() && getParent()->getParent()->isReplicator())

Contributor: nit: Might be good to create a variable for getParent()->getParent() so it's clearer what it actually is. Is this supposed to be a VPRegionBlock?

Contributor Author: Yep, updated, thanks!

break;

bool IsLoad = UI->getOpcode() == Instruction::Load;
// TODO: Handle cases where we need to pass a SCEV to
// getAddressComputationCost.
if (shouldUseAddressAccessSCEV(getOperand(!IsLoad)))

Contributor: nit: Sorry, didn't see this before. Since you're reusing the ptr operand below as well (see Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1))), perhaps cleaner to create a variable for it?

  VPValue *PtrOp = getOperand(!IsLoad);
  if (shouldUseAddressAccessSCEV(PtrOp))
...
  Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);

Contributor Author: Done, thanks.

break;

Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
const Align Alignment = getLoadStoreAlignment(UI);
unsigned AS = getLoadStoreAddressSpace(UI);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);

Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);

InstructionCost ScalarCost =
ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
if (isSingleScalar())
return ScalarCost;

SmallVector<const VPValue *> OpsToScalarize;
Type *ResultTy = Type::getVoidTy(getParent()->getPlan()->getContext());

Contributor: I think you can get the context more easily from another type, i.e.

  Type *ResultTy = Type::getVoidTy(PtrTy->getContext());

Contributor Author: Done, thanks.

// Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
// don't assign scalarization overhead in general, if the target prefers
// vectorized addressing or the loaded value is used as part of an address
// of another load or store.
if (Ctx.TTI.prefersVectorizedAddressing() ||

Contributor: Perhaps create a variable to keep the result of Ctx.TTI.prefersVectorizedAddressing(), since it's also used below?

Contributor Author: Done, thanks.

!isUsedByLoadStoreAddress(this)) {

Contributor: I know this is only a minor thing, but it feels more logical to ask this question first, i.e. (!isUsedByLoadStoreAddress(this) || Ctx.TTI.prefersVectorizedAddressing()). It sounds like what we're saying is that if the result is used to calculate the address for a subsequent load or store, then we only calculate the scalarisation overhead if the target prefers vectorised addressing, since in this case we'll need to do inserts/extracts. Is that a correct understanding?

Contributor Author:
> It sounds like what we're saying is that if the result is used to calculate the address for a subsequent load or store, then we only calculate the scalarisation overhead if the target prefers vectorised addressing, since in this case we'll need to do inserts/extracts. Is that a correct understanding?

Yep, exactly. The reason for checking TTI first is that it is likely cheaper than isUsedByLoadStoreAddress, which needs to walk the use list. I kept the order as-is for now.

if (!(IsLoad && !Ctx.TTI.prefersVectorizedAddressing()) &&
!(!IsLoad && Ctx.TTI.supportsEfficientVectorElementLoadStore()))
append_range(OpsToScalarize, operands());

if (!Ctx.TTI.supportsEfficientVectorElementLoadStore())

Contributor: Keep a copy of the result from the first invocation in a variable?

Contributor Author: Done, thanks.

ResultTy = Ctx.Types.inferScalarType(this);
}

return (ScalarCost * VF.getFixedValue()) +
Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
}
}

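
For the replicating (non-single-scalar) load/store path above, the returned cost is the per-lane scalar cost (memory operation plus address computation) multiplied by the fixed VF, plus the scalarization overhead for the operands collected in OpsToScalarize and, when ResultTy is non-void, for packing the scalar results. A purely illustrative calculation with hypothetical TTI numbers, not taken from any particular target: with VF = 4, a scalar memory-op cost of 1 and an address-computation cost of 1, ScalarCost is 2, so the first term is 2 * 4 = 8; if one vector operand costs 1 per extract and the loaded result 1 per insert, getScalarizationOverhead contributes another 8, giving a total cost of 16.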