@@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR)))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
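
For orientation, a hedged C++ sketch (not part of the patch; the function and variable names are invented) of the source patterns the two new helpers are aimed at: the address of A[I] below is a GEP whose only non-invariant index is the induction variable, the case shouldUseAddressAccessSCEV flags, while the value loaded from Idx[I] feeds, via another GEP, the address of the load from B, the kind of use isUsedByLoadStoreAddress walks the VPlan def-use chain to find.

// Illustrative source only; names are hypothetical.
float sumIndirect(float *A, float *B, int *Idx, long N, float F) {
  float Sum = 0.0f;
  for (long I = 0; I < N; ++I) {
    A[I] *= F;        // GEP indexed by the induction variable I
                      // (the shouldUseAddressAccessSCEV-style address).
    Sum += B[Idx[I]]; // the loaded index becomes part of another load's
                      // address (what isUsedByLoadStoreAddress looks for).
  }
  return Sum;
}
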
@@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
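
Taken together, the new Load/Store case prices a replicated access as a per-lane cost (scalar memory op plus address computation) scaled by the fixed VF, plus scalarization overhead when operands must be extracted or results packed. A minimal sketch of that shape, using plain integers and hypothetical names rather than the real TTI/VPlan APIs:

#include <cstdint>

// Hedged illustration of the cost shape only; InstructionCost, the TTI hooks,
// and the OpsToScalarize/ResultTy logic from the patch are abstracted away.
int64_t replicatedMemAccessCost(uint64_t VF, int64_t ScalarMemOpCost,
                                int64_t AddrComputationCost,
                                int64_t ScalarizationOverhead,
                                bool IsSingleScalar) {
  int64_t ScalarCost = ScalarMemOpCost + AddrComputationCost;
  if (IsSingleScalar)
    return ScalarCost; // uniform address: the access is emitted once
  // Replicated access: one scalar load/store per lane, plus the cost of
  // extracting operands from / inserting results into vectors.
  return ScalarCost * static_cast<int64_t>(VF) + ScalarizationOverhead;
}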