@@ -5491,12 +5491,16 @@ static bool isMaskedLoadCompress(
54915491 const unsigned Sz = VL.size();
54925492 auto *VecTy = getWidenedType(ScalarTy, Sz);
54935493 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5494+ SmallVector<int> Mask;
5495+ if (!Order.empty())
5496+ inversePermutation(Order, Mask);
54945497 // Check external uses.
54955498 for (const auto [I, V] : enumerate(VL)) {
54965499 if (AreAllUsersVectorized(V))
54975500 continue;
54985501 InstructionCost ExtractCost =
5499- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
5502+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
5503+ Mask.empty() ? I : Mask[I]);
55005504 InstructionCost ScalarCost =
55015505 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
55025506 if (ExtractCost <= ScalarCost)
@@ -5536,8 +5540,11 @@ static bool isMaskedLoadCompress(
55365540 bool IsStrided =
55375541 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
55385542 assert(CompressMask.size() >= 2 && "At least two elements are required");
5543+ SmallVector<Value *> OrderedPointerOps(PointerOps);
5544+ if (!Order.empty())
5545+ reorderScalars(OrderedPointerOps, Mask);
55395546 auto [ScalarGEPCost, VectorGEPCost] =
5540- getGEPCosts(TTI, PointerOps, PointerOps .front(),
5547+ getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps .front(),
55415548 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
55425549 // The cost of scalar loads.
55435550 InstructionCost ScalarLoadsCost =
@@ -5564,17 +5571,16 @@ static bool isMaskedLoadCompress(
55645571 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
55655572 LI->getPointerAddressSpace(), CostKind);
55665573 }
5567- SmallVector<int> Mask;
5568- if (!Order.empty())
5569- inversePermutation(Order, Mask);
55705574 if (IsStrided) {
55715575 // Check for potential segmented(interleaved) loads.
55725576 if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
55735577 CommonAlignment,
55745578 LI->getPointerAddressSpace())) {
5575- InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
5576- Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
5577- CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
5579+ InstructionCost InterleavedCost =
5580+ VectorGEPCost + TTI.getInterleavedMemoryOpCost(
5581+ Instruction::Load, LoadVecTy, CompressMask[1],
5582+ std::nullopt, CommonAlignment,
5583+ LI->getPointerAddressSpace(), CostKind, IsMasked);
55785584 if (!Mask.empty())
55795585 InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
55805586 VecTy, Mask, CostKind);
0 commit comments