@@ -4326,6 +4326,11 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
 }
 
+static std::pair<InstructionCost, InstructionCost>
+getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
+            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
+            Type *ScalarTy, VectorType *VecTy);
+
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
     SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
@@ -4464,31 +4469,56 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       if (VectorizedCnt == VL.size() / VF) {
         // Compare masked gather cost and loads + insertsubvector costs.
         TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-        InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
-            Instruction::Load, VecTy,
-            cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, CommonAlignment, CostKind);
+        auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+            TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
+            CostKind, ScalarTy, VecTy);
+        InstructionCost MaskedGatherCost =
+            TTI.getGatherScatterOpCost(
+                Instruction::Load, VecTy,
+                cast<LoadInst>(VL0)->getPointerOperand(),
+                /*VariableMask=*/false, CommonAlignment, CostKind) +
+            VectorGEPCost - ScalarGEPCost;
         InstructionCost VecLdCost = 0;
         auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
         for (auto [I, LS] : enumerate(States)) {
           auto *LI0 = cast<LoadInst>(VL[I * VF]);
           switch (LS) {
-          case LoadsState::Vectorize:
+          case LoadsState::Vectorize: {
+            auto [ScalarGEPCost, VectorGEPCost] =
+                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                            LI0->getPointerOperand(), Instruction::Load,
+                            CostKind, ScalarTy, SubVecTy);
             VecLdCost += TTI.getMemoryOpCost(
-                Instruction::Load, SubVecTy, LI0->getAlign(),
-                LI0->getPointerAddressSpace(), CostKind,
-                TTI::OperandValueInfo());
+                             Instruction::Load, SubVecTy, LI0->getAlign(),
+                             LI0->getPointerAddressSpace(), CostKind,
+                             TTI::OperandValueInfo()) +
+                         VectorGEPCost - ScalarGEPCost;
             break;
-          case LoadsState::StridedVectorize:
-            VecLdCost += TTI.getStridedMemoryOpCost(
-                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-                /*VariableMask=*/false, CommonAlignment, CostKind);
+          }
+          case LoadsState::StridedVectorize: {
+            auto [ScalarGEPCost, VectorGEPCost] =
+                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                            LI0->getPointerOperand(), Instruction::Load,
+                            CostKind, ScalarTy, SubVecTy);
+            VecLdCost +=
+                TTI.getStridedMemoryOpCost(
+                    Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                    /*VariableMask=*/false, CommonAlignment, CostKind) +
+                VectorGEPCost - ScalarGEPCost;
             break;
-          case LoadsState::ScatterVectorize:
-            VecLdCost += TTI.getGatherScatterOpCost(
-                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-                /*VariableMask=*/false, CommonAlignment, CostKind);
+          }
+          case LoadsState::ScatterVectorize: {
+            auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+                TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                LI0->getPointerOperand(), Instruction::GetElementPtr,
+                CostKind, ScalarTy, SubVecTy);
+            VecLdCost +=
+                TTI.getGatherScatterOpCost(
+                    Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                    /*VariableMask=*/false, CommonAlignment, CostKind) +
+                VectorGEPCost - ScalarGEPCost;
             break;
+          }
           case LoadsState::Gather:
             llvm_unreachable(
                 "Expected only consecutive, strided or masked gather loads.");
@@ -4497,13 +4527,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
           for (int Idx : seq<int>(0, VL.size()))
             ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
           VecLdCost +=
-              TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy,
-                                 ShuffleMask, CostKind, I * VF, SubVecTy);
+              TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
+                                 CostKind, I * VF, SubVecTy);
         }
         // If masked gather cost is higher - better to vectorize, so
         // consider it as a gather node. It will be better estimated
         // later.
-        if (MaskedGatherCost > VecLdCost)
+        if (MaskedGatherCost >= VecLdCost)
           return true;
       }
     }
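
The hunks above adjust the break-even test in two ways: each strategy is now also charged the difference between its vector address computation and the scalar GEPs it replaces (VectorGEPCost - ScalarGEPCost), and the final comparison uses >= so that a tie favors splitting into smaller vector loads over keeping a single masked gather. Below is a minimal standalone sketch of that comparison; ChunkCost, preferSplitLoads, and every cost number are invented for illustration, whereas in the real pass all values come from TargetTransformInfo.

#include <cstdio>
#include <vector>

// Hypothetical per-chunk summary; in SLPVectorizer these values come from
// TTI.getMemoryOpCost / getStridedMemoryOpCost / getGatherScatterOpCost plus
// the getGEPCosts pair for the chunk's slice of PointerOps.
struct ChunkCost {
  int LoadCost;      // cost of loading one VF-wide chunk
  int VectorGEPCost; // cost of the vectorized address computation
  int ScalarGEPCost; // cost of the scalar address computation it replaces
};

// Mirrors the patched comparison: both sides pay their vector GEP cost and
// are refunded the scalar GEP cost, and a tie (>=) prefers the split loads.
static bool preferSplitLoads(int MaskedGatherCost, int GatherVecGEPCost,
                             int GatherSclGEPCost,
                             const std::vector<ChunkCost> &Chunks,
                             int InsertSubvectorCost) {
  int GatherSide = MaskedGatherCost + GatherVecGEPCost - GatherSclGEPCost;
  int SplitSide = 0;
  for (const ChunkCost &C : Chunks)
    SplitSide += C.LoadCost + C.VectorGEPCost - C.ScalarGEPCost;
  // One insert_subvector shuffle per chunk stitches the pieces back together.
  SplitSide += InsertSubvectorCost * static_cast<int>(Chunks.size());
  return GatherSide >= SplitSide;
}

int main() {
  // Two consecutive 4-wide loads vs. one 8-wide masked gather, with invented
  // numbers: the contiguous loads win even after paying for the two shuffles.
  std::vector<ChunkCost> Chunks = {{2, 1, 2}, {2, 1, 2}};
  bool Split = preferSplitLoads(/*MaskedGatherCost=*/12,
                                /*GatherVecGEPCost=*/2,
                                /*GatherSclGEPCost=*/8, Chunks,
                                /*InsertSubvectorCost=*/1);
  std::printf("prefer split loads: %s\n", Split ? "yes" : "no");
}
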
@@ -7951,7 +7981,13 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 
   ScalarCost =
       TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
-  if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+  auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
+  if (!BaseGEP) {
+    auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
+    if (It != Ptrs.end())
+      BaseGEP = cast<GEPOperator>(*It);
+  }
+  if (BaseGEP) {
     SmallVector<const Value *> Indices(BaseGEP->indices());
     VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                              BaseGEP->getPointerOperand(), Indices, VecTy,
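
The last hunk lets getGEPCosts produce a vector-GEP estimate even when BasePtr itself is not a GEP (for example a plain pointer argument or a PHI) by falling back to the first GEPOperator found among Ptrs. A toy sketch of that selection logic follows, with the same caveat: the classes below only stand in for llvm::Value and llvm::GEPOperator, and dynamic_cast stands in for LLVM's dyn_cast and find_if with IsaPred.

#include <cstdio>
#include <vector>

struct Value { virtual ~Value() = default; };
struct GEPOperator : Value {};

// Pick the GEP used to price one representative vector address computation:
// the base pointer if it is a GEP, otherwise the first GEP in the chain,
// otherwise nothing (VecCost then keeps its default estimate).
static GEPOperator *selectBaseGEP(Value *BasePtr,
                                  const std::vector<Value *> &Ptrs) {
  if (auto *GEP = dynamic_cast<GEPOperator *>(BasePtr))
    return GEP;
  for (Value *P : Ptrs) // the fallback this hunk introduces
    if (auto *GEP = dynamic_cast<GEPOperator *>(P))
      return GEP;
  return nullptr;
}

int main() {
  Value Base;    // the base pointer is not a GEP
  GEPOperator G; // but the pointer chain contains one
  std::vector<Value *> Ptrs = {&Base, &G};
  std::printf("found GEP: %s\n", selectBaseGEP(&Base, Ptrs) ? "yes" : "no");
}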