@@ -4820,105 +4820,173 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
}
}
}
- auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
+ // Correctly identify compare the cost of loads + shuffles rather than
+ // strided/masked gather loads. Returns true if vectorized + shuffles
+ // representation is better than just gather.
+ auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
+ bool ProfitableGatherPointers) {
+ // Compare masked gather cost and loads + insert subvector costs.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto [ScalarGEPCost, VectorGEPCost] =
+ getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
+ // Estimate the cost of masked gather GEP. If not a splat, roughly
+ // estimate as a buildvector, otherwise estimate as splat.
+ APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+ VectorType *PtrVecTy =
+ getWidenedType(PointerOps.front()->getType()->getScalarType(),
+ VecTy->getNumElements());
+ if (static_cast<unsigned>(count_if(
+ PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
+ any_of(PointerOps, [&](Value *V) {
+ return getUnderlyingObject(V) !=
+ getUnderlyingObject(PointerOps.front());
+ }))
+ VectorGEPCost += TTI.getScalarizationOverhead(
+ PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+ else
+ VectorGEPCost +=
+ TTI.getScalarizationOverhead(
+ PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
+ /*Insert=*/true, /*Extract=*/false, CostKind) +
+ ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, std::nullopt,
+ CostKind);
+ // The cost of scalar loads.
+ InstructionCost ScalarLoadsCost =
+ std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+ [&](InstructionCost C, Value *V) {
+ return C + TTI.getInstructionCost(
+ cast<Instruction>(V), CostKind);
+ }) +
+ ScalarGEPCost;
+ // The cost of masked gather.
+ InstructionCost MaskedGatherCost =
+ TTI.getGatherScatterOpCost(
+ Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind) +
+ (ProfitableGatherPointers ? 0 : VectorGEPCost);
+ InstructionCost GatherCost =
+ TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarLoadsCost;
+ // The list of loads is small or perform partial check already - directly
+ // compare masked gather cost and gather cost.
+ constexpr unsigned ListLimit = 4;
+ if (!TryRecursiveCheck || VL.size() < ListLimit)
+ return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
- unsigned MinVF = getMinVF(Sz);
- unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
- MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
- for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
- unsigned VectorizedCnt = 0;
+ unsigned MinVF = getMinVF(2 * Sz);
+ DemandedElts.clearAllBits();
+ // Iterate through possible vectorization factors and check if vectorized +
+ // shuffles is better than just gather.
+ for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
SmallVector<LoadsState> States;
- for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
- Cnt += VF, ++VectorizedCnt) {
+ for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
SmallVector<unsigned> Order;
SmallVector<Value *> PointerOps;
LoadsState LS =
canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
/*TryRecursiveCheck=*/false);
// Check that the sorted loads are consecutive.
- if (LS == LoadsState::Gather)
- break;
+ if (LS == LoadsState::Gather) {
+ DemandedElts.setBits(Cnt, Cnt + VF);
+ continue;
+ }
// If need the reorder - consider as high-cost masked gather for now.
if ((LS == LoadsState::Vectorize ||
LS == LoadsState::StridedVectorize) &&
!Order.empty() && !isReverseOrder(Order))
LS = LoadsState::ScatterVectorize;
States.push_back(LS);
}
+ if (DemandedElts.isAllOnes())
+ // All loads gathered - try smaller VF.
+ continue;
+ InstructionCost ScalarVFGEPCost = 0;
// Can be vectorized later as a serie of loads/insertelements.
- if (VectorizedCnt == VL.size() / VF) {
- // Compare masked gather cost and loads + insersubvector costs.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, PointerOps, PointerOps.front(),
- Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
- InstructionCost MaskedGatherCost =
- TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
- cast<LoadInst>(VL0)->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment,
- CostKind) +
- VectorGEPCost - ScalarGEPCost;
- InstructionCost VecLdCost = 0;
- auto *SubVecTy = getWidenedType(ScalarTy, VF);
- for (auto [I, LS] : enumerate(States)) {
- auto *LI0 = cast<LoadInst>(VL[I * VF]);
- switch (LS) {
- case LoadsState::Vectorize: {
- auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
- LI0->getPointerOperand(), Instruction::Load,
- CostKind, ScalarTy, SubVecTy);
- VecLdCost += TTI.getMemoryOpCost(
- Instruction::Load, SubVecTy, LI0->getAlign(),
- LI0->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo()) +
- VectorGEPCost - ScalarGEPCost;
- break;
- }
- case LoadsState::StridedVectorize: {
- auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
- LI0->getPointerOperand(), Instruction::Load,
- CostKind, ScalarTy, SubVecTy);
- VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
- LI0->getPointerOperand(),
- /*VariableMask=*/false,
- CommonAlignment, CostKind) +
- VectorGEPCost - ScalarGEPCost;
- break;
- }
- case LoadsState::ScatterVectorize: {
- auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
- TTI, ArrayRef(PointerOps).slice(I * VF, VF),
- LI0->getPointerOperand(), Instruction::GetElementPtr, CostKind,
- ScalarTy, SubVecTy);
- VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
- LI0->getPointerOperand(),
- /*VariableMask=*/false,
- CommonAlignment, CostKind) +
- VectorGEPCost - ScalarGEPCost;
- break;
- }
- case LoadsState::Gather:
- llvm_unreachable(
- "Expected only consecutive, strided or masked gather loads.");
- }
- SmallVector<int> ShuffleMask(VL.size());
- for (int Idx : seq<int>(0, VL.size()))
- ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
+ InstructionCost VecLdCost = 0;
+ if (!DemandedElts.isZero()) {
+ VecLdCost =
+ TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarGEPCost;
+ for (unsigned Idx : seq<unsigned>(VL.size()))
+ if (DemandedElts[Idx])
+ VecLdCost +=
+ TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
+ }
+ auto *SubVecTy = getWidenedType(ScalarTy, VF);
+ for (auto [I, LS] : enumerate(States)) {
+ auto *LI0 = cast<LoadInst>(VL[I * VF]);
+ InstructionCost VectorGEPCost =
+ (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
+ ? 0
+ : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+ LI0->getPointerOperand(),
+ Instruction::GetElementPtr, CostKind, ScalarTy,
+ SubVecTy)
+ .second;
+ if (LS == LoadsState::ScatterVectorize) {
+ if (static_cast<unsigned>(
+ count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
+ PointerOps.size() - 1 ||
+ any_of(PointerOps, [&](Value *V) {
+ return getUnderlyingObject(V) !=
+ getUnderlyingObject(PointerOps.front());
+ }))
+ VectorGEPCost += TTI.getScalarizationOverhead(
+ SubVecTy, APInt::getAllOnes(VF),
+ /*Insert=*/true, /*Extract=*/false, CostKind);
+ else
+ VectorGEPCost += TTI.getScalarizationOverhead(
+ SubVecTy, APInt::getOneBitSet(VF, 0),
+ /*Insert=*/true, /*Extract=*/false, CostKind) +
+ ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy,
+ std::nullopt, CostKind);
+ }
+ switch (LS) {
+ case LoadsState::Vectorize:
+ VecLdCost +=
+ TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
+ LI0->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo()) +
+ VectorGEPCost;
+ break;
+ case LoadsState::StridedVectorize:
+ VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
+ LI0->getPointerOperand(),
+ /*VariableMask=*/false,
+ CommonAlignment, CostKind) +
+ VectorGEPCost;
+ break;
+ case LoadsState::ScatterVectorize:
+ VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
+ LI0->getPointerOperand(),
+ /*VariableMask=*/false,
+ CommonAlignment, CostKind) +
+ VectorGEPCost;
+ break;
+ case LoadsState::Gather:
+ // Gathers are already calculated - ignore.
+ continue;
+ }
+ SmallVector<int> ShuffleMask(VL.size());
+ for (int Idx : seq<int>(0, VL.size()))
+ ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
+ if (I > 0)
VecLdCost +=
::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
CostKind, I * VF, SubVecTy);
- }
- // If masked gather cost is higher - better to vectorize, so
- // consider it as a gather node. It will be better estimated
- // later.
- if (MaskedGatherCost >= VecLdCost)
- return true;
}
+ // If masked gather cost is higher - better to vectorize, so
+ // consider it as a gather node. It will be better estimated
+ // later.
+ if (MaskedGatherCost >= VecLdCost &&
+ VecLdCost - GatherCost < -SLPCostThreshold)
+ return true;
}
- return false;
+ return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
};
// TODO: need to improve analysis of the pointers, if not all of them are
// GEPs or have > 2 operands, we end up with a gather node, which just
@@ -4939,7 +5007,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
!TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
// Check if potential masked gather can be represented as series
// of loads + insertsubvectors.
- if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
+ if (TryRecursiveCheck &&
+ CheckForShuffledLoads(CommonAlignment, ProfitableGatherPointers)) {
// If masked gather cost is higher - better to vectorize, so
// consider it as a gather node. It will be better estimated
// later.
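For readers skimming the hunks above, the following is a minimal standalone sketch (plain C++, not LLVM code) of the profitability comparison the new CheckForShuffledLoads lambda performs: it weighs a masked gather of the whole load group against a plain gather of scalar loads and against vectorized subvector loads glued together with insert-subvector shuffles. All cost values are invented for illustration, and the two comparisons are collapsed into one helper for brevity; in the real pass the first check runs once per vectorization factor and the second only after all factors have been tried.

#include <cstdio>

// Toy cost model: each field stands in for an InstructionCost the real pass
// would query from TargetTransformInfo.
struct Costs {
  double MaskedGather;      // single masked gather of the whole load group
  double Gather;            // scalar loads + buildvector inserts
  double VecLdPlusShuffles; // subvector loads + insert-subvector shuffles
};

// Returns true when the group should be kept as a gather node (to be
// re-vectorized later as loads + shuffles) instead of emitted as a masked
// gather; mirrors the comparison structure of the patch.
static bool preferLoadsPlusShuffles(const Costs &C, double SLPCostThreshold) {
  // Per-VF check: loads + shuffles beat the masked gather, and beat the
  // plain gather by more than the threshold.
  if (C.MaskedGather >= C.VecLdPlusShuffles &&
      C.VecLdPlusShuffles - C.Gather < -SLPCostThreshold)
    return true;
  // Fallback check: masked gather is not sufficiently cheaper than gather.
  return C.MaskedGather - C.Gather >= -SLPCostThreshold;
}

int main() {
  // Invented numbers for an 8-wide load group split into two VF=4 slices.
  Costs C{/*MaskedGather=*/12.0, /*Gather=*/10.0, /*VecLdPlusShuffles=*/7.0};
  std::printf("prefer loads + shuffles: %s\n",
              preferLoadsPlusShuffles(C, /*SLPCostThreshold=*/0.0) ? "yes"
                                                                   : "no");
  return 0;
}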