@@ -40,6 +40,7 @@
 #include <cassert>
 
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
 
 using VectorParts = SmallVector<Value *, 2>;
 
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   VPRecipeBase *OpR = Op->getDefiningRecipe();
 
   // If the partial reduction is predicated, a select will be operand 0
-  using namespace llvm::VPlanPatternMatch;
   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
     OpR = Op->getDefiningRecipe();
   }
@@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
 
   VPValue *Op0, *Op1;
-  using namespace llvm::VPlanPatternMatch;
   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
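For intuition, here is a minimal standalone sketch (not part of the patch; the function names and element types are hypothetical) of the two address shapes the helpers added above distinguish. Whether these loops actually end up as replicated recipes depends on the target and on legality, so this illustrates only the address patterns, not the vectorizer's decisions.

```cpp
#include <cstdint>

// The address is a GEP indexed directly by the induction variable. This is
// the kind of pointer computation shouldUseAddressAccessSCEV() flags, where
// the legacy model would cost the address via a SCEV, so the new code path
// bails out to the existing fallback for it.
int64_t sumAffine(const int32_t *A, int64_t N) {
  int64_t Sum = 0;
  for (int64_t I = 0; I < N; ++I)
    Sum += A[I];
  return Sum;
}

// The value loaded from B feeds the address of the load from A (A[B[I]]).
// This is what isUsedByLoadStoreAddress() walks the use chains to detect;
// when the target does not prefer vectorized addressing, such a load is
// costed without extra scalarization overhead.
int64_t sumIndexed(const int32_t *A, const int32_t *B, int64_t N) {
  int64_t Sum = 0;
  for (int64_t I = 0; I < N; ++I)
    Sum += A[B[I]];
  return Sum;
}
```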
@@ -3218,21 +3273,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    bool UsedByLoadStoreAddress =
+        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
+                              nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize if scalarization is needed. In general,
+    // we don't assign scalarization overhead if the target prefers vectorized
+    // addressing or if the loaded value is used as part of the address of
+    // another load or store.
+    if (!UsedByLoadStoreAddress) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
 
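As a rough summary of the cost composition in the load/store case above, here is a minimal standalone sketch. Plain integers stand in for InstructionCost, all names and inputs are hypothetical, and the early bail-out paths (scalable VF, replicator regions, SCEV-based address costs) are ignored.

```cpp
#include <cstdint>

// Hypothetical helper mirroring the shape of the final cost computed above:
// one scalar copy costs the memory operation plus its address computation; a
// replicated access pays that VF times plus any scalarization overhead.
uint64_t replicatedMemAccessCost(uint64_t ScalarMemOpCost,
                                 uint64_t AddressComputationCost,
                                 uint64_t ScalarizationOverhead, uint64_t VF,
                                 bool IsSingleScalar) {
  uint64_t ScalarCost = ScalarMemOpCost + AddressComputationCost;
  if (IsSingleScalar)
    return ScalarCost;
  return ScalarCost * VF + ScalarizationOverhead;
}
```

For example, with a scalar memory-op cost of 1, an address-computation cost of 1, VF = 4, and a scalarization overhead of 2, the sketch yields 4 * 2 + 2 = 10.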