@@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }

+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR)))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                    ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }

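For context on what isUsedByLoadStoreAddress() is looking for, here is a rough, hypothetical C++ sketch (not taken from the patch; the function name and loop shape are illustrative assumptions only) of a source loop where a replicated load only feeds the address of another memory access:

// Hypothetical example, not from the commit: the value loaded from Idx[I] is
// only used to compute the address of the A[...] access. If that access ends
// up as a replicated load in the VPlan, isUsedByLoadStoreAddress() returns
// true for the Idx load, which feeds into whether scalarization overhead is
// charged for it in VPReplicateRecipe::computeCost().
void gatherLike(int *A, const int *Idx, int *Out, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = A[Idx[I]];
}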