16 changes: 10 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;

VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {

// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
*CM.PSE.getSE());
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
@@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
*CM.PSE.getSE());
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -8418,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -9874,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
CM.CostKind);
CM.CostKind, *CM.PSE.getSE());
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
9 changes: 7 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
}

InstructionCost VPCostContext::getScalarizationOverhead(
Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
bool AlwaysIncludeReplicatingR) {
if (VF.isScalar())
return 0;

@@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
if (Op->isLiveIn() ||
(!AlwaysIncludeReplicatingR &&
isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
(isa<VPReplicateRecipe>(Op) &&
cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
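With the new AlwaysIncludeReplicatingR flag set, operands defined by replicating recipes are no longer skipped wholesale when estimating scalarization overhead; live-ins and replicated scalar loads still contribute nothing. A standalone paraphrase of the widened skip condition, for illustration only (the helper name is mine, not part of the patch, and the UniqueOperands deduplication is omitted):

// Sketch: mirrors the skip condition in VPCostContext::getScalarizationOverhead.
static bool skipOperandForOverhead(const VPValue *Op,
                                   bool AlwaysIncludeReplicatingR) {
  if (Op->isLiveIn())
    return true;
  if (!AlwaysIncludeReplicatingR &&
      isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op))
    return true;
  // Replicated scalar loads are skipped even when the flag is set.
  auto *RepR = dyn_cast<VPReplicateRecipe>(Op);
  return RepR && RepR->getOpcode() == Instruction::Load;
}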
16 changes: 10 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
ScalarEvolution &SE;

VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
TargetTransformInfo::TargetCostKind CostKind)
TargetTransformInfo::TargetCostKind CostKind,
ScalarEvolution &SE)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
CostKind(CostKind) {}
CostKind(CostKind), SE(SE) {}

/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@

/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
/// type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Type *ResultTy,
ArrayRef<const VPValue *> Operands,
ElementCount VF);
/// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
/// is true, always compute the cost of scalarizing replicating operands.
InstructionCost
getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
ElementCount VF,
bool AlwaysIncludeReplicatingR = false);
};

/// This class can be used to assign names to VPValues. For VPValues without
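Taken together, the header changes thread a ScalarEvolution reference through VPCostContext and add an opt-in AlwaysIncludeReplicatingR parameter to getScalarizationOverhead. A minimal caller sketch of the updated signatures follows; it assumes CM, Plan, ResultTy, OpsToScalarize and VF are in scope as they are at the existing call sites, and is illustrative only:

VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind,
                      *CM.PSE.getSE());  // new ScalarEvolution argument
InstructionCost Overhead = CostCtx.getScalarizationOverhead(
    ResultTy, OpsToScalarize, VF,
    /*AlwaysIncludeReplicatingR=*/true); // defaults to false when omitted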
124 changes: 109 additions & 15 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -40,6 +40,7 @@
#include <cassert>

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

using VectorParts = SmallVector<Value *, 2>;

@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
VPRecipeBase *OpR = Op->getDefiningRecipe();

// If the partial reduction is predicated, a select will be operand 0
using namespace llvm::VPlanPatternMatch;
if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
OpR = Op->getDefiningRecipe();
}
@@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

VPValue *Op0, *Op1;
using namespace llvm::VPlanPatternMatch;
if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
(match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {
});
}

/// Returns true if \p Ptr is a pointer computation for which the legacy cost
/// model computes a SCEV expression when computing the address cost.
static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
auto *PtrR = Ptr->getDefiningRecipe();
if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
Instruction::GetElementPtr) ||
isa<VPWidenGEPRecipe>(PtrR) ||
match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
return false;

// We are looking for a GEP where all indices are either loop invariant or
// inductions.
for (VPValue *Opd : drop_begin(PtrR->operands())) {
if (!Opd->isDefinedOutsideLoopRegions() &&
!isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
return false;
}

return true;
}

/// Returns true if \p V is used as part of the address of another load or
/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
SmallPtrSet<const VPUser *, 4> Seen;
SmallVector<const VPUser *> WorkList = {V};

while (!WorkList.empty()) {
auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
if (!Cur || !Seen.insert(Cur).second)
continue;

for (VPUser *U : Cur->users()) {
if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
if (InterleaveR->getAddr() == Cur)
return true;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
if (RepR->getOpcode() == Instruction::Load &&
RepR->getOperand(0) == Cur)
return true;
if (RepR->getOpcode() == Instruction::Store &&
RepR->getOperand(1) == Cur)
return true;
}
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
if (MemR->getAddr() == Cur && MemR->isConsecutive())
return true;
}
}

append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
}
return false;
}

InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3218,21 +3273,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
if (isSingleScalar()) {
bool IsLoad = UI->getOpcode() == Instruction::Load;
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
const Align Alignment = getLoadStoreAlignment(UI);
unsigned AS = getLoadStoreAddressSpace(UI);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
}
if (VF.isScalable() && !isSingleScalar())
return InstructionCost::getInvalid();

// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
break;
const VPRegionBlock *ParentRegion = getParent()->getParent();
if (ParentRegion && ParentRegion->isReplicator())
break;

bool IsLoad = UI->getOpcode() == Instruction::Load;
const VPValue *PtrOp = getOperand(!IsLoad);
// TODO: Handle cases where we need to pass a SCEV to
// getAddressComputationCost.
if (shouldUseAddressAccessSCEV(PtrOp))
break;

Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
const Align Alignment = getLoadStoreAlignment(UI);
unsigned AS = getLoadStoreAddressSpace(UI);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);

Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
bool UsedByLoadStoreAddress =
!PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
InstructionCost ScalarCost =
ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
nullptr, Ctx.CostKind);
if (isSingleScalar())
return ScalarCost;

SmallVector<const VPValue *> OpsToScalarize;
Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
// Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
// don't assign scalarization overhead in general, if the target prefers
// vectorized addressing or the loaded value is used as part of an address
// of another load or store.
if (!UsedByLoadStoreAddress) {
bool EfficientVectorLoadStore =
Ctx.TTI.supportsEfficientVectorElementLoadStore();
if (!(IsLoad && !PreferVectorizedAddressing) &&
!(!IsLoad && EfficientVectorLoadStore))
append_range(OpsToScalarize, operands());

if (!EfficientVectorLoadStore)
ResultTy = Ctx.Types.inferScalarType(this);
}

return (ScalarCost * VF.getFixedValue()) +
Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
}
}

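As a rough worked example of how the final replicated load/store cost above is assembled (all values invented for illustration; they are not taken from any real TargetTransformInfo):

// Hypothetical numbers only, to show how the pieces combine.
ElementCount VF = ElementCount::getFixed(4);
InstructionCost ScalarMemOpCost = 1; // assumed getMemoryOpCost result
InstructionCost AddrCost = 1;        // assumed getAddressComputationCost result
InstructionCost ScalarCost = ScalarMemOpCost + AddrCost;      // 2 per lane
InstructionCost Replicated = ScalarCost * VF.getFixedValue(); // 8 in total
// Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, /*AlwaysIncludeReplicatingR=*/true)
// is added on top when the result or operands need packing/unpacking.

The new ARM test that follows contains exactly the shape isUsedByLoadStoreAddress looks for: a replicated i8 load whose zero-extended result feeds the GEP used as the address of a later store.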
@@ -0,0 +1,84 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -p loop-vectorize -S %s | FileCheck %s

target triple = "armv7-unknown-linux-gnueabihf"

define void @replicating_load_used_by_other_load(i32 %arg, ptr %a, i32 %b) {
; CHECK-LABEL: define void @replicating_load_used_by_other_load(
; CHECK-SAME: i32 [[ARG:%.*]], ptr [[A:%.*]], i32 [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[ARG]], %[[ENTRY]] ]
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[IV]], 1
; CHECK-NEXT: [[AND_1:%.*]] = and i32 [[IV]], 1
; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 [[IV]], 2
; CHECK-NEXT: [[SHL_2:%.*]] = shl i32 [[IV]], 1
; CHECK-NEXT: [[AND_2:%.*]] = and i32 [[SHL_2]], 2
; CHECK-NEXT: [[OR_1:%.*]] = or i32 [[AND_2]], [[AND_1]]
; CHECK-NEXT: [[OR_2:%.*]] = or i32 [[OR_1]], [[SHL_1]]
; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[B]], [[OR_2]]
; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[XOR_1]], [[ARG]]
; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[SHL_1]], 1
; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[SHR]], [[ARG]]
; CHECK-NEXT: [[AND_3:%.*]] = and i32 [[XOR_3]], 1
; CHECK-NEXT: [[AND_4:%.*]] = and i32 [[IV]], 2147483646
; CHECK-NEXT: [[OR_3:%.*]] = or i32 [[AND_3]], [[AND_4]]
; CHECK-NEXT: [[AND_5:%.*]] = and i32 [[IV]], 254
; CHECK-NEXT: [[SHL_3:%.*]] = shl i32 [[OR_3]], 1
; CHECK-NEXT: [[XOR_4:%.*]] = xor i32 [[SHL_3]], 2
; CHECK-NEXT: [[OR_4:%.*]] = or i32 [[AND_5]], [[XOR_4]]
; CHECK-NEXT: [[XOR_5:%.*]] = xor i32 [[SHR_2]], [[OR_4]]
; CHECK-NEXT: [[XOR_6:%.*]] = xor i32 [[XOR_5]], [[XOR_2]]
; CHECK-NEXT: [[AND_6:%.*]] = and i32 [[XOR_6]], 255
; CHECK-NEXT: [[XOR_7:%.*]] = xor i32 [[AND_6]], 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[XOR_7]]
; CHECK-NEXT: [[LD:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD]] to i32
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr null, i32 [[ZEXT]]
; CHECK-NEXT: store i32 0, ptr [[GEP_2]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 100
; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop

loop:
%iv = phi i32 [ %iv.next, %loop ], [ %arg, %entry ]
%shr = lshr i32 %iv, 1
%and.1 = and i32 %iv, 1
%shl.1 = shl i32 %iv, 2
%shl.2 = shl i32 %iv, 1
%and.2 = and i32 %shl.2, 2
%or.1 = or i32 %and.2, %and.1
%or.2 = or i32 %or.1, %shl.1
%xor.1 = xor i32 %b, %or.2
%xor.2 = xor i32 %xor.1, %arg
%shr.2 = lshr i32 %shl.1, 1
%xor.3 = xor i32 %shr, %arg
%and.3 = and i32 %xor.3, 1
%and.4 = and i32 %iv, 2147483646
%or.3 = or i32 %and.3, %and.4
%and.5 = and i32 %iv, 254
%shl.3 = shl i32 %or.3, 1
%xor.4 = xor i32 %shl.3, 2
%or.4 = or i32 %and.5, %xor.4
%xor.5 = xor i32 %shr.2, %or.4
%xor.6 = xor i32 %xor.5, %xor.2
%and.6 = and i32 %xor.6, 255
%xor.7 = xor i32 %and.6, 1
%gep = getelementptr i8, ptr %a, i32 %xor.7
%ld = load i8, ptr %gep, align 1
%zext = zext i8 %ld to i32
%gep.2 = getelementptr i32, ptr null, i32 %zext
store i32 0, ptr %gep.2, align 4
%iv.next = add i32 %iv, 1
%cmp = icmp eq i32 %iv.next, 100
br i1 %cmp, label %exit, label %loop

exit:
ret void
}