diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f55f21c94a85a..934012b2e53f5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1534,6 +1534,14 @@ class TargetTransformInfo { Function *F, Type *RetTy, ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const; + /// \returns The cost of propagating Type \p DataType through Basic Block / + /// function boundaries. If \p IsCallingConv is specified, then \p DataType is + /// associated with either a function argument or return. Otherwise, \p + /// DataType is used in either a GEP instruction, or spans across BasicBlocks + /// (this is relevant because SelectionDAG builder may, for example, scalarize + /// illegal vectors across blocks, which introduces extract/insert code). + InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const; + /// \returns The number of pieces into which the provided type must be /// split during legalization. Zero is returned when the answer is unknown. 
unsigned getNumberOfParts(Type *Tp) const; @@ -2096,6 +2104,8 @@ class TargetTransformInfo::Concept { virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) = 0; + virtual InstructionCost getDataFlowCost(Type *DataType, + bool IsCallingConv) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; virtual InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0; @@ -2781,6 +2791,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { TTI::TargetCostKind CostKind) override { return Impl.getCallInstrCost(F, RetTy, Tys, CostKind); } + InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) override { + return Impl.getDataFlowCost(DataType, IsCallingConv); + } unsigned getNumberOfParts(Type *Tp) override { return Impl.getNumberOfParts(Tp); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7828bdc1f1f43..5a25a88c3eb46 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -772,6 +772,10 @@ class TargetTransformInfoImplBase { return 1; } + InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const { + return 0; + } + // Assume that we have a register of the right size for the type. unsigned getNumberOfParts(Type *Tp) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 9f8d3ded9b3c1..c6a5c38a1b3fd 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2410,6 +2410,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return 10; } + InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) { + return 0; + } + unsigned getNumberOfParts(Type *Tp) { std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); return LT.first.isValid() ? 
*LT.first.getValue() : 0; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 7e721cbc87f3f..edef9afa747d6 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1116,6 +1116,13 @@ TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy, return Cost; } +InstructionCost TargetTransformInfo::getDataFlowCost(Type *DataType, + bool IsCallingConv) const { + InstructionCost Cost = TTIImpl->getDataFlowCost(DataType, IsCallingConv); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { return TTIImpl->getNumberOfParts(Tp); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 437e01c37c6b6..52e6d02772914 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -306,6 +306,14 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const { return !F || !ST->isSingleLaneExecution(*F); } +InstructionCost GCNTTIImpl::getDataFlowCost(Type *DataType, + bool IsCallingConv) { + if (IsCallingConv || isTypeLegal(DataType)) + return BaseT::getDataFlowCost(DataType, IsCallingConv); + + return getNumberOfParts(DataType); +} + unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector // registers. See getRegisterClassForType for the implementation. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index b423df17302ca..c195c860075eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -161,6 +161,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); + InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv); + bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef<unsigned> Indices = {}) const; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ae0819c964bef..3a66dad1d10e6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9075,15 +9075,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); + InstructionCost CommonCost = 0; if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) - return 0; + return CommonCost; if (isa<InsertElementInst>(VL[0])) return InstructionCost::getInvalid(); - return processBuildVector<ShuffleCostEstimator>( - E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); + return CommonCost + + processBuildVector<ShuffleCostEstimator>( + E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); } - InstructionCost CommonCost = 0; SmallVector<int> Mask; bool IsReverseOrder = isReverseOrder(E->ReorderIndices); if (!E->ReorderIndices.empty() && @@ -9222,6 +9223,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, OpTE->Scalars.size()); } + // Calculate the cost difference of propagating a vector vs series of + // scalars across blocks. This may be nonzero in the case of illegal + // vectors. 
+ Type *ScalarTy = VL0->getType()->getScalarType(); + if (ScalarTy && isValidElementType(ScalarTy)) { + ScalarCost += TTI->getDataFlowCost(ScalarTy, + /*IsCallingConv=*/false) * + EntryVF; + CommonCost += TTI->getDataFlowCost( + FixedVectorType::get(ScalarTy, EntryVF), /*IsCallingConv=*/false); + } + return CommonCost - ScalarCost; } case Instruction::ExtractValue: @@ -10241,6 +10254,27 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; + + // Calculate the cost difference of propagating a vector vs series of scalars + // across blocks. This may be nonzero in the case of illegal vectors. + Instruction *VL0 = TE.getMainOp(); + if (VL0 && ((I + 1) < VectorizableTree.size())) { + Instruction *VL1 = VectorizableTree[I + 1]->getMainOp(); + if (VL1 && (VL0->getParent() != VL1->getParent())) { + Type *ScalarTy = VL0->getType()->getScalarType(); + if (ScalarTy && isValidElementType(ScalarTy)) { + InstructionCost ScalarDFlow = + TTI->getDataFlowCost(ScalarTy, + /*IsCallingConv=*/false) * + TE.getVectorFactor(); + InstructionCost VectorDFlow = TTI->getDataFlowCost( + FixedVectorType::get(ScalarTy, TE.getVectorFactor()), + /*IsCallingConv=*/false); + Cost += (VectorDFlow - ScalarDFlow); + } + } + } + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " << shortBundleName(TE.Scalars) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); @@ -10257,8 +10291,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. 
if (!isa_and_nonnull<InsertElementInst>(EU.User) && - !ExtractCostCalculated.insert(EU.Scalar).second) + !ExtractCostCalculated.insert(EU.Scalar).second) { continue; + } // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be @@ -10267,8 +10302,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { continue; // No extract cost for vector "scalar" - if (isa<FixedVectorType>(EU.Scalar->getType())) + if (isa<FixedVectorType>(EU.Scalar->getType())) { + // Account for any additional costs required by CallingConvention for the + // type. + if (isa_and_nonnull<CallInst>(EU.User)) + Cost += + TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv=*/true); continue; + } // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match.