40 | 40 | #include <cassert>  | 
41 | 41 |  | 
42 | 42 | using namespace llvm;  | 
 | 43 | +using namespace llvm::VPlanPatternMatch;  | 
43 | 44 |  | 
44 | 45 | using VectorParts = SmallVector<Value *, 2>;  | 
45 | 46 |  | 
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,  | 
303 | 304 |   VPRecipeBase *OpR = Op->getDefiningRecipe();  | 
304 | 305 |  | 
305 | 306 |   // If the partial reduction is predicated, a select will be operand 0  | 
306 |  | -  using namespace llvm::VPlanPatternMatch;  | 
307 | 307 |   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {  | 
308 | 308 |     OpR = Op->getDefiningRecipe();  | 
309 | 309 |   }  | 
@@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,  | 
1963 | 1963 |   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);  | 
1964 | 1964 |  | 
1965 | 1965 |   VPValue *Op0, *Op1;  | 
1966 |  | -  using namespace llvm::VPlanPatternMatch;  | 
1967 | 1966 |   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&  | 
1968 | 1967 |       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||  | 
1969 | 1968 |        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {  | 
@@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {  | 
3111 | 3110 |   });  | 
3112 | 3111 | }  | 
3113 | 3112 |  | 
 | 3113 | +/// Returns true if \p Ptr is a pointer computation for which the legacy cost  | 
 | 3114 | +/// model computes a SCEV expression when computing the address cost.  | 
 | 3115 | +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {  | 
 | 3116 | +  auto *PtrR = Ptr->getDefiningRecipe();  | 
 | 3117 | +  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&  | 
 | 3118 | +                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==  | 
 | 3119 | +                      Instruction::GetElementPtr) ||  | 
 | 3120 | +                 isa<VPWidenGEPRecipe>(PtrR) ||  | 
 | 3121 | +                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))  | 
 | 3122 | +    return false;  | 
 | 3123 | + | 
 | 3124 | +  // We are looking for a GEP where all indices are either loop invariant or  | 
 | 3125 | +  // inductions.  | 
 | 3126 | +  for (VPValue *Opd : drop_begin(PtrR->operands())) {  | 
 | 3127 | +    if (!Opd->isDefinedOutsideLoopRegions() &&  | 
 | 3128 | +        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))  | 
 | 3129 | +      return false;  | 
 | 3130 | +  }  | 
 | 3131 | + | 
 | 3132 | +  return true;  | 
 | 3133 | +}  | 
 | 3134 | + | 
 | 3135 | +/// Returns true if \p V is used as part of the address of another load or  | 
 | 3136 | +/// store.  | 
 | 3137 | +static bool isUsedByLoadStoreAddress(const VPUser *V) {  | 
 | 3138 | +  SmallPtrSet<const VPUser *, 4> Seen;  | 
 | 3139 | +  SmallVector<const VPUser *> WorkList = {V};  | 
 | 3140 | + | 
 | 3141 | +  while (!WorkList.empty()) {  | 
 | 3142 | +    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());  | 
 | 3143 | +    if (!Cur || !Seen.insert(Cur).second)  | 
 | 3144 | +      continue;  | 
 | 3145 | + | 
 | 3146 | +    for (VPUser *U : Cur->users()) {  | 
 | 3147 | +      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))  | 
 | 3148 | +        if (InterleaveR->getAddr() == Cur)  | 
 | 3149 | +          return true;  | 
 | 3150 | +      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {  | 
 | 3151 | +        if (RepR->getOpcode() == Instruction::Load &&  | 
 | 3152 | +            RepR->getOperand(0) == Cur)  | 
 | 3153 | +          return true;  | 
 | 3154 | +        if (RepR->getOpcode() == Instruction::Store &&  | 
 | 3155 | +            RepR->getOperand(1) == Cur)  | 
 | 3156 | +          return true;  | 
 | 3157 | +      }  | 
 | 3158 | +      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {  | 
 | 3159 | +        if (MemR->getAddr() == Cur && MemR->isConsecutive())  | 
 | 3160 | +          return true;  | 
 | 3161 | +      }  | 
 | 3162 | +    }  | 
 | 3163 | + | 
 | 3164 | +    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());  | 
 | 3165 | +  }  | 
 | 3166 | +  return false;  | 
 | 3167 | +}  | 
 | 3168 | + | 
3114 | 3169 | InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,  | 
3115 | 3170 |                                                VPCostContext &Ctx) const {  | 
3116 | 3171 |   Instruction *UI = cast<Instruction>(getUnderlyingValue());  | 
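
For orientation, the loops below are hypothetical C++ examples (mine, not part of the patch) that roughly match the shapes the two new helpers test for: `shouldUseAddressAccessSCEV` looks for a GEP address whose indices are all loop-invariant or inductions, and `isUsedByLoadStoreAddress` looks for a value that feeds the address of another memory access.

```cpp
// Hypothetical illustrations only; the function names and shapes are invented.

// f(): the store address is a GEP with one induction index (i) and one
// loop-invariant index (j) -- the kind of pointer computation for which
// shouldUseAddressAccessSCEV() returns true.
void f(int (*A)[64], long j, long N) {
  for (long i = 0; i < N; ++i)
    A[i][j] = 0;
}

// g(): the pointer loaded from Ptrs[i] is itself used as the address of the
// second load -- the pattern isUsedByLoadStoreAddress() walks the user graph
// to detect, in which case no scalarization overhead is charged for it.
int g(int **Ptrs, long N) {
  int Sum = 0;
  for (long i = 0; i < N; ++i)
    Sum += *Ptrs[i];
  return Sum;
}
```
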
@@ -3218,21 +3273,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,  | 
3218 | 3273 |   }  | 
3219 | 3274 |   case Instruction::Load:  | 
3220 | 3275 |   case Instruction::Store: {  | 
3221 |  | -    if (isSingleScalar()) {  | 
3222 |  | -      bool IsLoad = UI->getOpcode() == Instruction::Load;  | 
3223 |  | -      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));  | 
3224 |  | -      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));  | 
3225 |  | -      const Align Alignment = getLoadStoreAlignment(UI);  | 
3226 |  | -      unsigned AS = getLoadStoreAddressSpace(UI);  | 
3227 |  | -      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));  | 
3228 |  | -      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(  | 
3229 |  | -          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);  | 
3230 |  | -      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(  | 
3231 |  | -                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);  | 
3232 |  | -    }  | 
 | 3276 | +    if (VF.isScalable() && !isSingleScalar())  | 
 | 3277 | +      return InstructionCost::getInvalid();  | 
 | 3278 | + | 
3233 | 3279 |     // TODO: See getMemInstScalarizationCost for how to handle replicating and  | 
3234 | 3280 |     // predicated cases.  | 
3235 |  | -    break;  | 
 | 3281 | +    const VPRegionBlock *ParentRegion = getParent()->getParent();  | 
 | 3282 | +    if (ParentRegion && ParentRegion->isReplicator())  | 
 | 3283 | +      break;  | 
 | 3284 | + | 
 | 3285 | +    bool IsLoad = UI->getOpcode() == Instruction::Load;  | 
 | 3286 | +    const VPValue *PtrOp = getOperand(!IsLoad);  | 
 | 3287 | +    // TODO: Handle cases where we need to pass a SCEV to  | 
 | 3288 | +    // getAddressComputationCost.  | 
 | 3289 | +    if (shouldUseAddressAccessSCEV(PtrOp))  | 
 | 3290 | +      break;  | 
 | 3291 | + | 
 | 3292 | +    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));  | 
 | 3293 | +    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);  | 
 | 3294 | +    const Align Alignment = getLoadStoreAlignment(UI);  | 
 | 3295 | +    unsigned AS = getLoadStoreAddressSpace(UI);  | 
 | 3296 | +    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));  | 
 | 3297 | +    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(  | 
 | 3298 | +        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);  | 
 | 3299 | + | 
 | 3300 | +    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);  | 
 | 3301 | + | 
 | 3302 | +    InstructionCost ScalarCost =  | 
 | 3303 | +        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(  | 
 | 3304 | +                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);  | 
 | 3305 | +    if (isSingleScalar())  | 
 | 3306 | +      return ScalarCost;  | 
 | 3307 | + | 
 | 3308 | +    SmallVector<const VPValue *> OpsToScalarize;  | 
 | 3309 | +    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());  | 
 | 3310 | +    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we  | 
 | 3311 | +    // don't assign scalarization overhead in general, if the target prefers  | 
 | 3312 | +    // vectorized addressing or the loaded value is used as part of an address  | 
 | 3313 | +    // of another load or store.  | 
 | 3314 | +    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();  | 
 | 3315 | +    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {  | 
 | 3316 | +      bool EfficientVectorLoadStore =  | 
 | 3317 | +          Ctx.TTI.supportsEfficientVectorElementLoadStore();  | 
 | 3318 | +      if (!(IsLoad && !PreferVectorizedAddressing) &&  | 
 | 3319 | +          !(!IsLoad && EfficientVectorLoadStore))  | 
 | 3320 | +        append_range(OpsToScalarize, operands());  | 
 | 3321 | + | 
 | 3322 | +      if (!EfficientVectorLoadStore)  | 
 | 3323 | +        ResultTy = Ctx.Types.inferScalarType(this);  | 
 | 3324 | +    }  | 
 | 3325 | + | 
 | 3326 | +    return (ScalarCost * VF.getFixedValue()) +  | 
 | 3327 | +           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);  | 
3236 | 3328 |   }  | 
3237 | 3329 |   }  | 
3238 | 3330 |  | 
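
Taken together, the new Load/Store branch reduces to a simple shape once the early bail-outs (scalable non-single-scalar VFs, replicate regions, SCEV-computed addresses) are out of the way. The snippet below is a standalone sketch of that arithmetic, not LLVM code; the component costs are hypothetical placeholders for what `getMemoryOpCost`, `getAddressComputationCost`, and `getScalarizationOverhead` would return.

```cpp
// Standalone sketch of the cost shape for a replicated (non-single-scalar)
// load/store at a fixed VF. All component costs are hypothetical inputs.
#include <cstdio>

unsigned replicatedMemCost(unsigned ScalarMemOpCost, unsigned AddrCompCost,
                           unsigned VF, unsigned ScalarizationOverhead) {
  // Per-lane cost: one scalar memory op plus one address computation.
  unsigned ScalarCost = ScalarMemOpCost + AddrCompCost;
  // Paid once per lane, plus any extract/insert overhead for operands or the
  // produced value when they need to be scalarized.
  return ScalarCost * VF + ScalarizationOverhead;
}

int main() {
  // e.g. mem op = 1, address = 1, VF = 4, overhead = 3 -> (1 + 1) * 4 + 3 = 11
  std::printf("%u\n", replicatedMemCost(1, 1, 4, 3));
  return 0;
}
```
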