diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 89a85bc8a9086..7455e2a24460b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1744,6 +1744,21 @@ class TargetTransformInfo { bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const; + /// Return true if sinking I's operands to the same basic block as I is + /// profitable, e.g. because the operands can be folded into a target + /// instruction during instruction selection. After calling the function + /// \p Ops contains the Uses to sink ordered by dominance (dominating users + /// come first). + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; + + /// Return true if it's significantly cheaper to shift a vector by a uniform + /// scalar than by an amount which will vary across each lane. On x86 before + /// AVX2 for example, there is a "psllw" instruction for the former case, but + /// no simple instruction for a general "a << b" operation on vectors. + /// This should also apply to lowering for vector funnel shifts (rotates). + bool isVectorShiftByScalarCheap(Type *Ty) const; + struct VPLegalization { enum VPTransform { // keep the predicating parameter @@ -2182,6 +2197,11 @@ class TargetTransformInfo::Concept { virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const = 0; + virtual bool + isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &OpsToSink) const = 0; + + virtual bool isVectorShiftByScalarCheap(Type *Ty) const = 0; virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; @@ -2952,6 +2972,15 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasActiveVectorLength(Opcode, DataType, Alignment); } + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const override { + return Impl.isProfitableToSinkOperands(I, Ops); + }; + + bool isVectorShiftByScalarCheap(Type *Ty) const override { + return Impl.isVectorShiftByScalarCheap(Ty); + } + VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const override { return Impl.getVPLegalizationStrategy(PI); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index eca8818cc25e6..c3ae0f9b255f1 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -968,6 +968,13 @@ class TargetTransformInfoImplBase { return false; } + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const { + return false; + } + + bool isVectorShiftByScalarCheap(Type *Ty) const { return false; } + TargetTransformInfo::VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const { return TargetTransformInfo::VPLegalization( diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3842af56e6b3d..97f4345493b0b 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2860,15 +2860,6 @@ class TargetLoweringBase { return Value == 0; } - /// Return true if it's significantly cheaper to shift a vector by a uniform - /// scalar than by an amount which will vary across each lane. 
On x86 before - /// AVX2 for example, there is a "psllw" instruction for the former case, but - /// no simple instruction for a general "a << b" operation on vectors. - /// This should also apply to lowering for vector funnel shifts (rotates). - virtual bool isVectorShiftByScalarCheap(Type *Ty) const { - return false; - } - /// Given a shuffle vector SVI representing a vector splat, return a new /// scalar type of size equal to SVI's scalar type if the new type is more /// profitable. Returns nullptr otherwise. For example under MVE float splats @@ -3085,16 +3076,6 @@ class TargetLoweringBase { /// a larger type. virtual bool signExtendConstant(const ConstantInt *C) const { return false; } - /// Return true if sinking I's operands to the same basic block as I is - /// profitable, e.g. because the operands can be folded into a target - /// instruction during instruction selection. After calling the function - /// \p Ops contains the Uses to sink ordered by dominance (dominating users - /// come first). - virtual bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const { - return false; - } - /// Try to optimize extending or truncating conversion instructions (like /// zext, trunc, fptoui, uitofp) for the target. virtual bool diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index b5195f764cbd1..54fef785073b1 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1354,6 +1354,15 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +bool TargetTransformInfo::isProfitableToSinkOperands( + Instruction *I, SmallVectorImpl &OpsToSink) const { + return TTIImpl->isProfitableToSinkOperands(I, OpsToSink); +} + +bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const { + return TTIImpl->isVectorShiftByScalarCheap(Ty); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 631cc26d6022f..3e09fbad6ab19 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7274,7 +7274,7 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) { // We can't do this effectively in SDAG because we may not be able to // determine if the select operands are splats from within a basic block. Type *Ty = Shift->getType(); - if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) + if (!Ty->isVectorTy() || !TTI->isVectorShiftByScalarCheap(Ty)) return false; Value *Cond, *TVal, *FVal; if (!match(Shift->getOperand(1), @@ -7309,7 +7309,7 @@ bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) { // We can't do this effectively in SDAG because we may not be able to // determine if the select operands are splats from within a basic block. Type *Ty = Fsh->getType(); - if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) + if (!Ty->isVectorTy() || !TTI->isVectorShiftByScalarCheap(Ty)) return false; Value *Cond, *TVal, *FVal; if (!match(Fsh->getOperand(2), @@ -7566,7 +7566,7 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { // If the operands of I can be folded into a target instruction together with // I, duplicate and sink them. 
SmallVector OpsToSink; - if (!TLI->shouldSinkOperands(I, OpsToSink)) + if (!TTI->isProfitableToSinkOperands(I, OpsToSink)) return false; // OpsToSink can contain multiple uses in a use chain (e.g. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4166d9bd22bc0..a46848c37375e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16246,422 +16246,6 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { return true; } -static bool isSplatShuffle(Value *V) { - if (auto *Shuf = dyn_cast(V)) - return all_equal(Shuf->getShuffleMask()); - return false; -} - -/// Check if both Op1 and Op2 are shufflevector extracts of either the lower -/// or upper half of the vector elements. -static bool areExtractShuffleVectors(Value *Op1, Value *Op2, - bool AllowSplat = false) { - // Scalable types can't be extract shuffle vectors. - if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) - return false; - - auto areTypesHalfed = [](Value *FullV, Value *HalfV) { - auto *FullTy = FullV->getType(); - auto *HalfTy = HalfV->getType(); - return FullTy->getPrimitiveSizeInBits().getFixedValue() == - 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); - }; - - auto extractHalf = [](Value *FullV, Value *HalfV) { - auto *FullVT = cast(FullV->getType()); - auto *HalfVT = cast(HalfV->getType()); - return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); - }; - - ArrayRef M1, M2; - Value *S1Op1 = nullptr, *S2Op1 = nullptr; - if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || - !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) - return false; - - // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that - // it is not checked as an extract below. - if (AllowSplat && isSplatShuffle(Op1)) - S1Op1 = nullptr; - if (AllowSplat && isSplatShuffle(Op2)) - S2Op1 = nullptr; - - // Check that the operands are half as wide as the result and we extract - // half of the elements of the input vectors. - if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || - (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) - return false; - - // Check the mask extracts either the lower or upper half of vector - // elements. - int M1Start = 0; - int M2Start = 0; - int NumElements = cast(Op1->getType())->getNumElements() * 2; - if ((S1Op1 && - !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || - (S2Op1 && - !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) - return false; - - if ((M1Start != 0 && M1Start != (NumElements / 2)) || - (M2Start != 0 && M2Start != (NumElements / 2))) - return false; - if (S1Op1 && S2Op1 && M1Start != M2Start) - return false; - - return true; -} - -/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth -/// of the vector elements. -static bool areExtractExts(Value *Ext1, Value *Ext2) { - auto areExtDoubled = [](Instruction *Ext) { - return Ext->getType()->getScalarSizeInBits() == - 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); - }; - - if (!match(Ext1, m_ZExtOrSExt(m_Value())) || - !match(Ext2, m_ZExtOrSExt(m_Value())) || - !areExtDoubled(cast(Ext1)) || - !areExtDoubled(cast(Ext2))) - return false; - - return true; -} - -/// Check if Op could be used with vmull_high_p64 intrinsic. 
-static bool isOperandOfVmullHighP64(Value *Op) { - Value *VectorOperand = nullptr; - ConstantInt *ElementIndex = nullptr; - return match(Op, m_ExtractElt(m_Value(VectorOperand), - m_ConstantInt(ElementIndex))) && - ElementIndex->getValue() == 1 && - isa(VectorOperand->getType()) && - cast(VectorOperand->getType())->getNumElements() == 2; -} - -/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. -static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { - return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); -} - -static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl &Ops) { - // Restrict ourselves to the form CodeGenPrepare typically constructs. - auto *GEP = dyn_cast(Ptrs); - if (!GEP || GEP->getNumOperands() != 2) - return false; - - Value *Base = GEP->getOperand(0); - Value *Offsets = GEP->getOperand(1); - - // We only care about scalar_base+vector_offsets. - if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) - return false; - - // Sink extends that would allow us to use 32-bit offset vectors. - if (isa(Offsets) || isa(Offsets)) { - auto *OffsetsInst = cast(Offsets); - if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && - OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32) - Ops.push_back(&GEP->getOperandUse(1)); - } - - // Sink the GEP. - return true; -} - -/// We want to sink following cases: -/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; -/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); -static bool shouldSinkVScale(Value *Op, SmallVectorImpl &Ops) { - if (match(Op, m_VScale())) - return true; - if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) || - match(Op, m_Mul(m_VScale(), m_ConstantInt()))) { - Ops.push_back(&cast(Op)->getOperandUse(0)); - return true; - } - if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) || - match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) { - Value *ZExtOp = cast(Op)->getOperand(0); - Ops.push_back(&cast(ZExtOp)->getOperandUse(0)); - Ops.push_back(&cast(Op)->getOperandUse(0)); - return true; - } - return false; -} - -/// Check if sinking \p I's operands to I's basic block is profitable, because -/// the operands can be folded into a target instruction, e.g. -/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 
-bool AArch64TargetLowering::shouldSinkOperands( - Instruction *I, SmallVectorImpl &Ops) const { - if (IntrinsicInst *II = dyn_cast(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::aarch64_neon_smull: - case Intrinsic::aarch64_neon_umull: - if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), - /*AllowSplat=*/true)) { - Ops.push_back(&II->getOperandUse(0)); - Ops.push_back(&II->getOperandUse(1)); - return true; - } - [[fallthrough]]; - - case Intrinsic::fma: - case Intrinsic::fmuladd: - if (isa(I->getType()) && - cast(I->getType())->getElementType()->isHalfTy() && - !Subtarget->hasFullFP16()) - return false; - [[fallthrough]]; - case Intrinsic::aarch64_neon_sqdmull: - case Intrinsic::aarch64_neon_sqdmulh: - case Intrinsic::aarch64_neon_sqrdmulh: - // Sink splats for index lane variants - if (isSplatShuffle(II->getOperand(0))) - Ops.push_back(&II->getOperandUse(0)); - if (isSplatShuffle(II->getOperand(1))) - Ops.push_back(&II->getOperandUse(1)); - return !Ops.empty(); - case Intrinsic::aarch64_neon_fmlal: - case Intrinsic::aarch64_neon_fmlal2: - case Intrinsic::aarch64_neon_fmlsl: - case Intrinsic::aarch64_neon_fmlsl2: - // Sink splats for index lane variants - if (isSplatShuffle(II->getOperand(1))) - Ops.push_back(&II->getOperandUse(1)); - if (isSplatShuffle(II->getOperand(2))) - Ops.push_back(&II->getOperandUse(2)); - return !Ops.empty(); - case Intrinsic::aarch64_sve_ptest_first: - case Intrinsic::aarch64_sve_ptest_last: - if (auto *IIOp = dyn_cast(II->getOperand(0))) - if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) - Ops.push_back(&II->getOperandUse(0)); - return !Ops.empty(); - case Intrinsic::aarch64_sme_write_horiz: - case Intrinsic::aarch64_sme_write_vert: - case Intrinsic::aarch64_sme_writeq_horiz: - case Intrinsic::aarch64_sme_writeq_vert: { - auto *Idx = dyn_cast(II->getOperand(1)); - if (!Idx || Idx->getOpcode() != Instruction::Add) - return false; - Ops.push_back(&II->getOperandUse(1)); - return true; - } - case Intrinsic::aarch64_sme_read_horiz: - case Intrinsic::aarch64_sme_read_vert: - case Intrinsic::aarch64_sme_readq_horiz: - case Intrinsic::aarch64_sme_readq_vert: - case Intrinsic::aarch64_sme_ld1b_vert: - case Intrinsic::aarch64_sme_ld1h_vert: - case Intrinsic::aarch64_sme_ld1w_vert: - case Intrinsic::aarch64_sme_ld1d_vert: - case Intrinsic::aarch64_sme_ld1q_vert: - case Intrinsic::aarch64_sme_st1b_vert: - case Intrinsic::aarch64_sme_st1h_vert: - case Intrinsic::aarch64_sme_st1w_vert: - case Intrinsic::aarch64_sme_st1d_vert: - case Intrinsic::aarch64_sme_st1q_vert: - case Intrinsic::aarch64_sme_ld1b_horiz: - case Intrinsic::aarch64_sme_ld1h_horiz: - case Intrinsic::aarch64_sme_ld1w_horiz: - case Intrinsic::aarch64_sme_ld1d_horiz: - case Intrinsic::aarch64_sme_ld1q_horiz: - case Intrinsic::aarch64_sme_st1b_horiz: - case Intrinsic::aarch64_sme_st1h_horiz: - case Intrinsic::aarch64_sme_st1w_horiz: - case Intrinsic::aarch64_sme_st1d_horiz: - case Intrinsic::aarch64_sme_st1q_horiz: { - auto *Idx = dyn_cast(II->getOperand(3)); - if (!Idx || Idx->getOpcode() != Instruction::Add) - return false; - Ops.push_back(&II->getOperandUse(3)); - return true; - } - case Intrinsic::aarch64_neon_pmull: - if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) - return false; - Ops.push_back(&II->getOperandUse(0)); - Ops.push_back(&II->getOperandUse(1)); - return true; - case Intrinsic::aarch64_neon_pmull64: - if (!areOperandsOfVmullHighP64(II->getArgOperand(0), - II->getArgOperand(1))) - return false; - 
Ops.push_back(&II->getArgOperandUse(0)); - Ops.push_back(&II->getArgOperandUse(1)); - return true; - case Intrinsic::masked_gather: - if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops)) - return false; - Ops.push_back(&II->getArgOperandUse(0)); - return true; - case Intrinsic::masked_scatter: - if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops)) - return false; - Ops.push_back(&II->getArgOperandUse(1)); - return true; - default: - return false; - } - } - - // Sink vscales closer to uses for better isel - switch (I->getOpcode()) { - case Instruction::GetElementPtr: - case Instruction::Add: - case Instruction::Sub: - for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { - if (shouldSinkVScale(I->getOperand(Op), Ops)) { - Ops.push_back(&I->getOperandUse(Op)); - return true; - } - } - break; - default: - break; - } - - if (!I->getType()->isVectorTy()) - return false; - - switch (I->getOpcode()) { - case Instruction::Sub: - case Instruction::Add: { - if (!areExtractExts(I->getOperand(0), I->getOperand(1))) - return false; - - // If the exts' operands extract either the lower or upper elements, we - // can sink them too. - auto Ext1 = cast(I->getOperand(0)); - auto Ext2 = cast(I->getOperand(1)); - if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { - Ops.push_back(&Ext1->getOperandUse(0)); - Ops.push_back(&Ext2->getOperandUse(0)); - } - - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - - return true; - } - case Instruction::Or: { - // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> - // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) - if (Subtarget->hasNEON()) { - Instruction *OtherAnd, *IA, *IB; - Value *MaskValue; - // MainAnd refers to And instruction that has 'Not' as one of its operands - if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), - m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), - m_Instruction(IA)))))) { - if (match(OtherAnd, - m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { - Instruction *MainAnd = I->getOperand(0) == OtherAnd - ? cast(I->getOperand(1)) - : cast(I->getOperand(0)); - - // Both Ands should be in same basic block as Or - if (I->getParent() != MainAnd->getParent() || - I->getParent() != OtherAnd->getParent()) - return false; - - // Non-mask operands of both Ands should also be in same basic block - if (I->getParent() != IA->getParent() || - I->getParent() != IB->getParent()) - return false; - - Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0)); - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - - return true; - } - } - } - - return false; - } - case Instruction::Mul: { - int NumZExts = 0, NumSExts = 0; - for (auto &Op : I->operands()) { - // Make sure we are not already sinking this operand - if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - if (match(&Op, m_SExt(m_Value()))) { - NumSExts++; - continue; - } else if (match(&Op, m_ZExt(m_Value()))) { - NumZExts++; - continue; - } - - ShuffleVectorInst *Shuffle = dyn_cast(Op); - - // If the Shuffle is a splat and the operand is a zext/sext, sinking the - // operand and the s/zext can help create indexed s/umull. This is - // especially useful to prevent i64 mul being scalarized. 
- if (Shuffle && isSplatShuffle(Shuffle) && - match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { - Ops.push_back(&Shuffle->getOperandUse(0)); - Ops.push_back(&Op); - if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) - NumSExts++; - else - NumZExts++; - continue; - } - - if (!Shuffle) - continue; - - Value *ShuffleOperand = Shuffle->getOperand(0); - InsertElementInst *Insert = dyn_cast(ShuffleOperand); - if (!Insert) - continue; - - Instruction *OperandInstr = dyn_cast(Insert->getOperand(1)); - if (!OperandInstr) - continue; - - ConstantInt *ElementConstant = - dyn_cast(Insert->getOperand(2)); - // Check that the insertelement is inserting into element 0 - if (!ElementConstant || !ElementConstant->isZero()) - continue; - - unsigned Opcode = OperandInstr->getOpcode(); - if (Opcode == Instruction::SExt) - NumSExts++; - else if (Opcode == Instruction::ZExt) - NumZExts++; - else { - // If we find that the top bits are known 0, then we can sink and allow - // the backend to generate a umull. - unsigned Bitwidth = I->getType()->getScalarSizeInBits(); - APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); - const DataLayout &DL = I->getDataLayout(); - if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) - continue; - NumZExts++; - } - - Ops.push_back(&Shuffle->getOperandUse(0)); - Ops.push_back(&Op); - } - - // Is it profitable to sink if we found two of the same type of extends. - return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); - } - default: - return false; - } - return false; -} - static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl &Mask) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 480bf60360bf5..81b0f22f21e84 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -687,9 +687,6 @@ class AArch64TargetLowering : public TargetLowering { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; - bool optimizeExtendOrTruncateConversion( Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 7a07bb67e77de..14274f8541522 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4671,3 +4671,420 @@ bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); } + +static bool isSplatShuffle(Value *V) { + if (auto *Shuf = dyn_cast(V)) + return all_equal(Shuf->getShuffleMask()); + return false; +} + +/// Check if both Op1 and Op2 are shufflevector extracts of either the lower +/// or upper half of the vector elements. +static bool areExtractShuffleVectors(Value *Op1, Value *Op2, + bool AllowSplat = false) { + // Scalable types can't be extract shuffle vectors. 
+ if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) + return false; + + auto areTypesHalfed = [](Value *FullV, Value *HalfV) { + auto *FullTy = FullV->getType(); + auto *HalfTy = HalfV->getType(); + return FullTy->getPrimitiveSizeInBits().getFixedValue() == + 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); + }; + + auto extractHalf = [](Value *FullV, Value *HalfV) { + auto *FullVT = cast(FullV->getType()); + auto *HalfVT = cast(HalfV->getType()); + return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); + }; + + ArrayRef M1, M2; + Value *S1Op1 = nullptr, *S2Op1 = nullptr; + if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || + !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) + return false; + + // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that + // it is not checked as an extract below. + if (AllowSplat && isSplatShuffle(Op1)) + S1Op1 = nullptr; + if (AllowSplat && isSplatShuffle(Op2)) + S2Op1 = nullptr; + + // Check that the operands are half as wide as the result and we extract + // half of the elements of the input vectors. + if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || + (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) + return false; + + // Check the mask extracts either the lower or upper half of vector + // elements. + int M1Start = 0; + int M2Start = 0; + int NumElements = cast(Op1->getType())->getNumElements() * 2; + if ((S1Op1 && + !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || + (S2Op1 && + !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) + return false; + + if ((M1Start != 0 && M1Start != (NumElements / 2)) || + (M2Start != 0 && M2Start != (NumElements / 2))) + return false; + if (S1Op1 && S2Op1 && M1Start != M2Start) + return false; + + return true; +} + +/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth +/// of the vector elements. +static bool areExtractExts(Value *Ext1, Value *Ext2) { + auto areExtDoubled = [](Instruction *Ext) { + return Ext->getType()->getScalarSizeInBits() == + 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); + }; + + if (!match(Ext1, m_ZExtOrSExt(m_Value())) || + !match(Ext2, m_ZExtOrSExt(m_Value())) || + !areExtDoubled(cast(Ext1)) || + !areExtDoubled(cast(Ext2))) + return false; + + return true; +} + +/// Check if Op could be used with vmull_high_p64 intrinsic. +static bool isOperandOfVmullHighP64(Value *Op) { + Value *VectorOperand = nullptr; + ConstantInt *ElementIndex = nullptr; + return match(Op, m_ExtractElt(m_Value(VectorOperand), + m_ConstantInt(ElementIndex))) && + ElementIndex->getValue() == 1 && + isa(VectorOperand->getType()) && + cast(VectorOperand->getType())->getNumElements() == 2; +} + +/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. +static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { + return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); +} + +static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl &Ops) { + // Restrict ourselves to the form CodeGenPrepare typically constructs. + auto *GEP = dyn_cast(Ptrs); + if (!GEP || GEP->getNumOperands() != 2) + return false; + + Value *Base = GEP->getOperand(0); + Value *Offsets = GEP->getOperand(1); + + // We only care about scalar_base+vector_offsets. + if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) + return false; + + // Sink extends that would allow us to use 32-bit offset vectors. 
+ if (isa(Offsets) || isa(Offsets)) { + auto *OffsetsInst = cast(Offsets); + if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && + OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32) + Ops.push_back(&GEP->getOperandUse(1)); + } + + // Sink the GEP. + return true; +} + +/// We want to sink following cases: +/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; +/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); +static bool shouldSinkVScale(Value *Op, SmallVectorImpl &Ops) { + if (match(Op, m_VScale())) + return true; + if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) || + match(Op, m_Mul(m_VScale(), m_ConstantInt()))) { + Ops.push_back(&cast(Op)->getOperandUse(0)); + return true; + } + if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) || + match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) { + Value *ZExtOp = cast(Op)->getOperand(0); + Ops.push_back(&cast(ZExtOp)->getOperandUse(0)); + Ops.push_back(&cast(Op)->getOperandUse(0)); + return true; + } + return false; +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). +bool AArch64TTIImpl::isProfitableToSinkOperands( + Instruction *I, SmallVectorImpl &Ops) const { + if (IntrinsicInst *II = dyn_cast(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), + /*AllowSplat=*/true)) { + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; + } + [[fallthrough]]; + + case Intrinsic::fma: + case Intrinsic::fmuladd: + if (isa(I->getType()) && + cast(I->getType())->getElementType()->isHalfTy() && + !ST->hasFullFP16()) + return false; + [[fallthrough]]; + case Intrinsic::aarch64_neon_sqdmull: + case Intrinsic::aarch64_neon_sqdmulh: + case Intrinsic::aarch64_neon_sqrdmulh: + // Sink splats for index lane variants + if (isSplatShuffle(II->getOperand(0))) + Ops.push_back(&II->getOperandUse(0)); + if (isSplatShuffle(II->getOperand(1))) + Ops.push_back(&II->getOperandUse(1)); + return !Ops.empty(); + case Intrinsic::aarch64_neon_fmlal: + case Intrinsic::aarch64_neon_fmlal2: + case Intrinsic::aarch64_neon_fmlsl: + case Intrinsic::aarch64_neon_fmlsl2: + // Sink splats for index lane variants + if (isSplatShuffle(II->getOperand(1))) + Ops.push_back(&II->getOperandUse(1)); + if (isSplatShuffle(II->getOperand(2))) + Ops.push_back(&II->getOperandUse(2)); + return !Ops.empty(); + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + if (auto *IIOp = dyn_cast(II->getOperand(0))) + if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) + Ops.push_back(&II->getOperandUse(0)); + return !Ops.empty(); + case Intrinsic::aarch64_sme_write_horiz: + case Intrinsic::aarch64_sme_write_vert: + case Intrinsic::aarch64_sme_writeq_horiz: + case Intrinsic::aarch64_sme_writeq_vert: { + auto *Idx = dyn_cast(II->getOperand(1)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(1)); + return true; + } + case Intrinsic::aarch64_sme_read_horiz: + case Intrinsic::aarch64_sme_read_vert: + case Intrinsic::aarch64_sme_readq_horiz: + case Intrinsic::aarch64_sme_readq_vert: + case Intrinsic::aarch64_sme_ld1b_vert: + case Intrinsic::aarch64_sme_ld1h_vert: + case Intrinsic::aarch64_sme_ld1w_vert: + case 
Intrinsic::aarch64_sme_ld1d_vert: + case Intrinsic::aarch64_sme_ld1q_vert: + case Intrinsic::aarch64_sme_st1b_vert: + case Intrinsic::aarch64_sme_st1h_vert: + case Intrinsic::aarch64_sme_st1w_vert: + case Intrinsic::aarch64_sme_st1d_vert: + case Intrinsic::aarch64_sme_st1q_vert: + case Intrinsic::aarch64_sme_ld1b_horiz: + case Intrinsic::aarch64_sme_ld1h_horiz: + case Intrinsic::aarch64_sme_ld1w_horiz: + case Intrinsic::aarch64_sme_ld1d_horiz: + case Intrinsic::aarch64_sme_ld1q_horiz: + case Intrinsic::aarch64_sme_st1b_horiz: + case Intrinsic::aarch64_sme_st1h_horiz: + case Intrinsic::aarch64_sme_st1w_horiz: + case Intrinsic::aarch64_sme_st1d_horiz: + case Intrinsic::aarch64_sme_st1q_horiz: { + auto *Idx = dyn_cast(II->getOperand(3)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(3)); + return true; + } + case Intrinsic::aarch64_neon_pmull: + if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) + return false; + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; + case Intrinsic::aarch64_neon_pmull64: + if (!areOperandsOfVmullHighP64(II->getArgOperand(0), + II->getArgOperand(1))) + return false; + Ops.push_back(&II->getArgOperandUse(0)); + Ops.push_back(&II->getArgOperandUse(1)); + return true; + case Intrinsic::masked_gather: + if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops)) + return false; + Ops.push_back(&II->getArgOperandUse(0)); + return true; + case Intrinsic::masked_scatter: + if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops)) + return false; + Ops.push_back(&II->getArgOperandUse(1)); + return true; + default: + return false; + } + } + + // Sink vscales closer to uses for better isel + switch (I->getOpcode()) { + case Instruction::GetElementPtr: + case Instruction::Add: + case Instruction::Sub: + for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { + if (shouldSinkVScale(I->getOperand(Op), Ops)) { + Ops.push_back(&I->getOperandUse(Op)); + return true; + } + } + break; + default: + break; + } + + if (!I->getType()->isVectorTy()) + return false; + + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + + // If the exts' operands extract either the lower or upper elements, we + // can sink them too. + auto Ext1 = cast(I->getOperand(0)); + auto Ext2 = cast(I->getOperand(1)); + if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { + Ops.push_back(&Ext1->getOperandUse(0)); + Ops.push_back(&Ext2->getOperandUse(0)); + } + + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + + return true; + } + case Instruction::Or: { + // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> + // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) + if (ST->hasNEON()) { + Instruction *OtherAnd, *IA, *IB; + Value *MaskValue; + // MainAnd refers to And instruction that has 'Not' as one of its operands + if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), + m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), + m_Instruction(IA)))))) { + if (match(OtherAnd, + m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { + Instruction *MainAnd = I->getOperand(0) == OtherAnd + ? 
cast(I->getOperand(1)) + : cast(I->getOperand(0)); + + // Both Ands should be in same basic block as Or + if (I->getParent() != MainAnd->getParent() || + I->getParent() != OtherAnd->getParent()) + return false; + + // Non-mask operands of both Ands should also be in same basic block + if (I->getParent() != IA->getParent() || + I->getParent() != IB->getParent()) + return false; + + Ops.push_back( + &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0)); + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + + return true; + } + } + } + + return false; + } + case Instruction::Mul: { + int NumZExts = 0, NumSExts = 0; + for (auto &Op : I->operands()) { + // Make sure we are not already sinking this operand + if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + if (match(&Op, m_SExt(m_Value()))) { + NumSExts++; + continue; + } else if (match(&Op, m_ZExt(m_Value()))) { + NumZExts++; + continue; + } + + ShuffleVectorInst *Shuffle = dyn_cast(Op); + + // If the Shuffle is a splat and the operand is a zext/sext, sinking the + // operand and the s/zext can help create indexed s/umull. This is + // especially useful to prevent i64 mul being scalarized. + if (Shuffle && isSplatShuffle(Shuffle) && + match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&Op); + if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) + NumSExts++; + else + NumZExts++; + continue; + } + + if (!Shuffle) + continue; + + Value *ShuffleOperand = Shuffle->getOperand(0); + InsertElementInst *Insert = dyn_cast(ShuffleOperand); + if (!Insert) + continue; + + Instruction *OperandInstr = dyn_cast(Insert->getOperand(1)); + if (!OperandInstr) + continue; + + ConstantInt *ElementConstant = + dyn_cast(Insert->getOperand(2)); + // Check that the insertelement is inserting into element 0 + if (!ElementConstant || !ElementConstant->isZero()) + continue; + + unsigned Opcode = OperandInstr->getOpcode(); + if (Opcode == Instruction::SExt) + NumSExts++; + else if (Opcode == Instruction::ZExt) + NumZExts++; + else { + // If we find that the top bits are known 0, then we can sink and allow + // the backend to generate a umull. + unsigned Bitwidth = I->getType()->getScalarSizeInBits(); + APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); + const DataLayout &DL = I->getDataLayout(); + if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) + continue; + NumZExts++; + } + + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&Op); + } + + // Is it profitable to sink if we found two of the same type of extends. 
+ return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); + } + default: + return false; + } + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 28e45207596ec..1d09d67f6ec9e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -416,7 +416,6 @@ class AArch64TTIImpl : public BasicTTIImplBase { InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const; - /// @} bool enableSelectOptimize() { return ST->enableSelectOptimize(); } @@ -435,6 +434,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2); + + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; + /// @} }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 94fdf4effa10a..61312efc53ad1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -6043,22 +6043,3 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const { return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks } - -/// Whether it is profitable to sink the operands of an -/// Instruction I to the basic block of I. -/// This helps using several modifiers (like abs and neg) more often. -bool AMDGPUTargetLowering::shouldSinkOperands( - Instruction *I, SmallVectorImpl &Ops) const { - using namespace PatternMatch; - - for (auto &Op : I->operands()) { - // Ensure we are not already sinking this operand. - if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); })) - continue; - - if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) - Ops.push_back(&Op); - } - - return !Ops.empty(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 5c2abd334276c..b2fd31cb2346e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -387,9 +387,6 @@ class AMDGPUTargetLowering : public TargetLowering { MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } - - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; }; namespace AMDGPUISD { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 4cf7733a260ff..0e051bf6e82fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1187,6 +1187,25 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp); } +/// Whether it is profitable to sink the operands of an +/// Instruction I to the basic block of I. +/// This helps using several modifiers (like abs and neg) more often. +bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const { + using namespace PatternMatch; + + for (auto &Op : I->operands()) { + // Ensure we are not already sinking this operand. 
+    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
+      continue;
+
+    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
+      Ops.push_back(&Op);
+  }
+
+  return !Ops.empty();
+}
+
 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 76785ee456a41..30da002376251 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -237,6 +237,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                   ArrayRef<const Value *> Args = {},
                   const Instruction *CxtI = nullptr);
 
+  bool isProfitableToSinkOperands(Instruction *I,
+                                  SmallVectorImpl<Use *> &Ops) const;
+
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 1733424a8b669..bf757edfa8589 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -19283,149 +19283,6 @@ bool ARMTargetLowering::isFNegFree(EVT VT) const {
   return false;
 }
 
-/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
-/// of the vector elements.
-static bool areExtractExts(Value *Ext1, Value *Ext2) {
-  auto areExtDoubled = [](Instruction *Ext) {
-    return Ext->getType()->getScalarSizeInBits() ==
-           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
-  };
-
-  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
-      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
-      !areExtDoubled(cast<Instruction>(Ext1)) ||
-      !areExtDoubled(cast<Instruction>(Ext2)))
-    return false;
-
-  return true;
-}
-
-/// Check if sinking \p I's operands to I's basic block is profitable, because
-/// the operands can be folded into a target instruction, e.g.
-/// sext/zext can be folded into vsubl.
-bool ARMTargetLowering::shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const { - if (!I->getType()->isVectorTy()) - return false; - - if (Subtarget->hasNEON()) { - switch (I->getOpcode()) { - case Instruction::Sub: - case Instruction::Add: { - if (!areExtractExts(I->getOperand(0), I->getOperand(1))) - return false; - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - return true; - } - default: - return false; - } - } - - if (!Subtarget->hasMVEIntegerOps()) - return false; - - auto IsFMSMul = [&](Instruction *I) { - if (!I->hasOneUse()) - return false; - auto *Sub = cast(*I->users().begin()); - return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; - }; - auto IsFMS = [&](Instruction *I) { - if (match(I->getOperand(0), m_FNeg(m_Value())) || - match(I->getOperand(1), m_FNeg(m_Value()))) - return true; - return false; - }; - - auto IsSinker = [&](Instruction *I, int Operand) { - switch (I->getOpcode()) { - case Instruction::Add: - case Instruction::Mul: - case Instruction::FAdd: - case Instruction::ICmp: - case Instruction::FCmp: - return true; - case Instruction::FMul: - return !IsFMSMul(I); - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - return Operand == 1; - case Instruction::Call: - if (auto *II = dyn_cast(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::fma: - return !IsFMS(I); - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::arm_mve_add_predicated: - case Intrinsic::arm_mve_mul_predicated: - case Intrinsic::arm_mve_qadd_predicated: - case Intrinsic::arm_mve_vhadd: - case Intrinsic::arm_mve_hadd_predicated: - case Intrinsic::arm_mve_vqdmull: - case Intrinsic::arm_mve_vqdmull_predicated: - case Intrinsic::arm_mve_vqdmulh: - case Intrinsic::arm_mve_qdmulh_predicated: - case Intrinsic::arm_mve_vqrdmulh: - case Intrinsic::arm_mve_qrdmulh_predicated: - case Intrinsic::arm_mve_fma_predicated: - return true; - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - case Intrinsic::arm_mve_sub_predicated: - case Intrinsic::arm_mve_qsub_predicated: - case Intrinsic::arm_mve_hsub_predicated: - case Intrinsic::arm_mve_vhsub: - return Operand == 1; - default: - return false; - } - } - return false; - default: - return false; - } - }; - - for (auto OpIdx : enumerate(I->operands())) { - Instruction *Op = dyn_cast(OpIdx.value().get()); - // Make sure we are not already sinking this operand - if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - Instruction *Shuffle = Op; - if (Shuffle->getOpcode() == Instruction::BitCast) - Shuffle = dyn_cast(Shuffle->getOperand(0)); - // We are looking for a splat that can be sunk. 
- if (!Shuffle || - !match(Shuffle, m_Shuffle( - m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) - continue; - if (!IsSinker(I, OpIdx.index())) - continue; - - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Op->uses()) { - Instruction *Insn = cast(U.getUser()); - if (!IsSinker(Insn, U.getOperandNo())) - return false; - } - - Ops.push_back(&Shuffle->getOperandUse(0)); - if (Shuffle != Op) - Ops.push_back(&Op->getOperandUse(0)); - Ops.push_back(&OpIdx.value()); - } - return true; -} - Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { if (!Subtarget->hasMVEIntegerOps()) return nullptr; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index a255e9b6fc365..316f7d3b9bce5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -460,8 +460,6 @@ class VectorType; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; Type* shouldConvertSplatType(ShuffleVectorInst* SVI) const override; bool isFNegFree(EVT VT) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 865e2f3066ef0..835ae98efb852 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2659,3 +2659,149 @@ bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const { return ST->hasARMOps(); } } + +/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth +/// of the vector elements. +static bool areExtractExts(Value *Ext1, Value *Ext2) { + using namespace PatternMatch; + + auto areExtDoubled = [](Instruction *Ext) { + return Ext->getType()->getScalarSizeInBits() == + 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); + }; + + if (!match(Ext1, m_ZExtOrSExt(m_Value())) || + !match(Ext2, m_ZExtOrSExt(m_Value())) || + !areExtDoubled(cast(Ext1)) || + !areExtDoubled(cast(Ext2))) + return false; + + return true; +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// sext/zext can be folded into vsubl. 
+bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const { + using namespace PatternMatch; + + if (!I->getType()->isVectorTy()) + return false; + + if (ST->hasNEON()) { + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + return true; + } + default: + return false; + } + } + + if (!ST->hasMVEIntegerOps()) + return false; + + auto IsFMSMul = [&](Instruction *I) { + if (!I->hasOneUse()) + return false; + auto *Sub = cast(*I->users().begin()); + return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; + }; + auto IsFMS = [&](Instruction *I) { + if (match(I->getOperand(0), m_FNeg(m_Value())) || + match(I->getOperand(1), m_FNeg(m_Value()))) + return true; + return false; + }; + + auto IsSinker = [&](Instruction *I, int Operand) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Mul: + case Instruction::FAdd: + case Instruction::ICmp: + case Instruction::FCmp: + return true; + case Instruction::FMul: + return !IsFMSMul(I); + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return Operand == 1; + case Instruction::Call: + if (auto *II = dyn_cast(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::fma: + return !IsFMS(I); + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_vhadd: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmull: + case Intrinsic::arm_mve_vqdmull_predicated: + case Intrinsic::arm_mve_vqdmulh: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_vqrdmulh: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_fma_predicated: + return true; + case Intrinsic::ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_hsub_predicated: + case Intrinsic::arm_mve_vhsub: + return Operand == 1; + default: + return false; + } + } + return false; + default: + return false; + } + }; + + for (auto OpIdx : enumerate(I->operands())) { + Instruction *Op = dyn_cast(OpIdx.value().get()); + // Make sure we are not already sinking this operand + if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + Instruction *Shuffle = Op; + if (Shuffle->getOpcode() == Instruction::BitCast) + Shuffle = dyn_cast(Shuffle->getOperand(0)); + // We are looking for a splat that can be sunk. 
+ if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), + m_ZeroInt()), + m_Undef(), m_ZeroMask()))) + continue; + if (!IsSinker(I, OpIdx.index())) + continue; + + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Op->uses()) { + Instruction *Insn = cast(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; + } + + Ops.push_back(&Shuffle->getOperandUse(0)); + if (Shuffle != Op) + Ops.push_back(&Op->getOperandUse(0)); + Ops.push_back(&OpIdx.value()); + } + return true; +} diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 7be53c4bcaa29..b0a75134ee02b 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -335,6 +335,8 @@ class ARMTTIImpl : public BasicTTIImplBase { bool hasArmWideBranch(bool Thumb) const; + bool isProfitableToSinkOperands(Instruction *I, + SmallVectorImpl &Ops) const; /// @} }; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e77f8783f1727..ac491909c179c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2069,145 +2069,6 @@ bool RISCVTargetLowering:: return !XC; } -bool RISCVTargetLowering::canSplatOperand(unsigned Opcode, int Operand) const { - switch (Opcode) { - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::ICmp: - case Instruction::FCmp: - return true; - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::Select: - return Operand == 1; - default: - return false; - } -} - - -bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const { - if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions()) - return false; - - if (canSplatOperand(I->getOpcode(), Operand)) - return true; - - auto *II = dyn_cast(I); - if (!II) - return false; - - switch (II->getIntrinsicID()) { - case Intrinsic::fma: - case Intrinsic::vp_fma: - return Operand == 0 || Operand == 1; - case Intrinsic::vp_shl: - case Intrinsic::vp_lshr: - case Intrinsic::vp_ashr: - case Intrinsic::vp_udiv: - case Intrinsic::vp_sdiv: - case Intrinsic::vp_urem: - case Intrinsic::vp_srem: - case Intrinsic::ssub_sat: - case Intrinsic::vp_ssub_sat: - case Intrinsic::usub_sat: - case Intrinsic::vp_usub_sat: - return Operand == 1; - // These intrinsics are commutative. - case Intrinsic::vp_add: - case Intrinsic::vp_mul: - case Intrinsic::vp_and: - case Intrinsic::vp_or: - case Intrinsic::vp_xor: - case Intrinsic::vp_fadd: - case Intrinsic::vp_fmul: - case Intrinsic::vp_icmp: - case Intrinsic::vp_fcmp: - case Intrinsic::smin: - case Intrinsic::vp_smin: - case Intrinsic::umin: - case Intrinsic::vp_umin: - case Intrinsic::smax: - case Intrinsic::vp_smax: - case Intrinsic::umax: - case Intrinsic::vp_umax: - case Intrinsic::sadd_sat: - case Intrinsic::vp_sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::vp_uadd_sat: - // These intrinsics have 'vr' versions. 
- case Intrinsic::vp_sub: - case Intrinsic::vp_fsub: - case Intrinsic::vp_fdiv: - return Operand == 0 || Operand == 1; - default: - return false; - } -} - -/// Check if sinking \p I's operands to I's basic block is profitable, because -/// the operands can be folded into a target instruction, e.g. -/// splats of scalars can fold into vector instructions. -bool RISCVTargetLowering::shouldSinkOperands( - Instruction *I, SmallVectorImpl &Ops) const { - using namespace llvm::PatternMatch; - - if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions()) - return false; - - // Don't sink splat operands if the target prefers it. Some targets requires - // S2V transfer buffers and we can run out of them copying the same value - // repeatedly. - // FIXME: It could still be worth doing if it would improve vector register - // pressure and prevent a vector spill. - if (!Subtarget.sinkSplatOperands()) - return false; - - for (auto OpIdx : enumerate(I->operands())) { - if (!canSplatOperand(I, OpIdx.index())) - continue; - - Instruction *Op = dyn_cast(OpIdx.value().get()); - // Make sure we are not already sinking this operand - if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - // We are looking for a splat that can be sunk. - if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) - continue; - - // Don't sink i1 splats. - if (cast(Op->getType())->getElementType()->isIntegerTy(1)) - continue; - - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Op->uses()) { - Instruction *Insn = cast(U.getUser()); - if (!canSplatOperand(Insn, U.getOperandNo())) - return false; - } - - Ops.push_back(&Op->getOperandUse(0)); - Ops.push_back(&OpIdx.value()); - } - return true; -} - bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { unsigned Opc = VecOp.getOpcode(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 05581552ab604..3864d58a129e9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -520,14 +520,6 @@ class RISCVTargetLowering : public TargetLowering { SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override; - /// Return true if the (vector) instruction I will be lowered to an instruction - /// with a scalar splat operand for the given Operand number. - bool canSplatOperand(Instruction *I, int Operand) const; - /// Return true if a vector instruction will lower to a target instruction - /// able to splat the given operand. 
- bool canSplatOperand(unsigned Opcode, int Operand) const; - bool shouldSinkOperands(Instruction *I, - SmallVectorImpl &Ops) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; int getLegalZfaFPImm(const APFloat &Imm, EVT VT) const; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index c042782389f18..0d3906d47227f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1974,8 +1974,8 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost( } auto getConstantMatCost = - [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost { - if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand)) + [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost { + if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand)) // Two sub-cases: // * Has a 5 bit immediate operand which can be splatted. // * Has a larger immediate which must be materialized in scalar register @@ -2289,3 +2289,141 @@ bool RISCVTTIImpl::shouldConsiderAddressTypePromotion( } return Considerable; } + +bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const { + switch (Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::ICmp: + case Instruction::FCmp: + return true; + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::Select: + return Operand == 1; + default: + return false; + } +} + +bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const { + if (!I->getType()->isVectorTy() || !ST->hasVInstructions()) + return false; + + if (canSplatOperand(I->getOpcode(), Operand)) + return true; + + auto *II = dyn_cast(I); + if (!II) + return false; + + switch (II->getIntrinsicID()) { + case Intrinsic::fma: + case Intrinsic::vp_fma: + return Operand == 0 || Operand == 1; + case Intrinsic::vp_shl: + case Intrinsic::vp_lshr: + case Intrinsic::vp_ashr: + case Intrinsic::vp_udiv: + case Intrinsic::vp_sdiv: + case Intrinsic::vp_urem: + case Intrinsic::vp_srem: + case Intrinsic::ssub_sat: + case Intrinsic::vp_ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::vp_usub_sat: + return Operand == 1; + // These intrinsics are commutative. + case Intrinsic::vp_add: + case Intrinsic::vp_mul: + case Intrinsic::vp_and: + case Intrinsic::vp_or: + case Intrinsic::vp_xor: + case Intrinsic::vp_fadd: + case Intrinsic::vp_fmul: + case Intrinsic::vp_icmp: + case Intrinsic::vp_fcmp: + case Intrinsic::smin: + case Intrinsic::vp_smin: + case Intrinsic::umin: + case Intrinsic::vp_umin: + case Intrinsic::smax: + case Intrinsic::vp_smax: + case Intrinsic::umax: + case Intrinsic::vp_umax: + case Intrinsic::sadd_sat: + case Intrinsic::vp_sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::vp_uadd_sat: + // These intrinsics have 'vr' versions. 
+  case Intrinsic::vp_sub:
+  case Intrinsic::vp_fsub:
+  case Intrinsic::vp_fdiv:
+    return Operand == 0 || Operand == 1;
+  default:
+    return false;
+  }
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// splats of scalars can fold into vector instructions.
+bool RISCVTTIImpl::isProfitableToSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+
+  if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
+    return false;
+
+  // Don't sink splat operands if the target prefers it. Some targets require
+  // S2V transfer buffers and we can run out of them copying the same value
+  // repeatedly.
+  // FIXME: It could still be worth doing if it would improve vector register
+  // pressure and prevent a vector spill.
+  if (!ST->sinkSplatOperands())
+    return false;
+
+  for (auto OpIdx : enumerate(I->operands())) {
+    if (!canSplatOperand(I, OpIdx.index()))
+      continue;
+
+    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
+    // Make sure we are not already sinking this operand
+    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+      continue;
+
+    // We are looking for a splat that can be sunk.
+    if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
+                             m_Undef(), m_ZeroMask())))
+      continue;
+
+    // Don't sink i1 splats.
+    if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
+      continue;
+
+    // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+    // and vector registers
+    for (Use &U : Op->uses()) {
+      Instruction *Insn = cast<Instruction>(U.getUser());
+      if (!canSplatOperand(Insn, U.getOperandNo()))
+        return false;
+    }
+
+    Ops.push_back(&Op->getOperandUse(0));
+    Ops.push_back(&OpIdx.value());
+  }
+  return true;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 65bbd90550855..3f50bd86b9b3b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -412,6 +412,15 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   shouldConsiderAddressTypePromotion(const Instruction &I,
                                      bool &AllowPromotionWithoutCommonHeader);
   std::optional<unsigned> getMinPageSize() const { return 4096; }
+  /// Return true if the (vector) instruction I will be lowered to an
+  /// instruction with a scalar splat operand for the given Operand number.
+  bool canSplatOperand(Instruction *I, int Operand) const;
+  /// Return true if a vector instruction will lower to a target instruction
+  /// able to splat the given operand.
+  bool canSplatOperand(unsigned Opcode, int Operand) const;
+
+  bool isProfitableToSinkOperands(Instruction *I,
+                                  SmallVectorImpl<Use *> &Ops) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index fa78bf38f426c..5f76d666823e2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -843,30 +843,6 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
   return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
 }
 
-bool WebAssemblyTargetLowering::shouldSinkOperands(
-    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
-  using namespace llvm::PatternMatch;
-
-  if (!I->getType()->isVectorTy() || !I->isShift())
-    return false;
-
-  Value *V = I->getOperand(1);
-  // We dont need to sink constant splat.
-  if (dyn_cast<Constant>(V))
-    return false;
-
-  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
-                         m_Value(), m_ZeroMask()))) {
-    // Sink insert
-    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
-    // Sink shuffle
-    Ops.push_back(&I->getOperandUse(1));
-    return true;
-  }
-
-  return false;
-}
-
 EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
                                                   LLVMContext &C,
                                                   EVT VT) const {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 7d9cfb7739e43..139b064aa0423 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -76,8 +76,6 @@ class WebAssemblyTargetLowering final : public TargetLowering {
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
-  bool shouldSinkOperands(Instruction *I,
-                          SmallVectorImpl<Use *> &Ops) const override;
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index b109594811d97..9fe5e5f27f8da 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -154,3 +154,27 @@ void WebAssemblyTTIImpl::getUnrollingPreferences(
 bool WebAssemblyTTIImpl::supportsTailCalls() const {
   return getST()->hasTailCall();
 }
+
+bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+
+  if (!I->getType()->isVectorTy() || !I->isShift())
+    return false;
+
+  Value *V = I->getOperand(1);
+  // We don't need to sink constant splat.
+  if (dyn_cast<Constant>(V))
+    return false;
+
+  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+                         m_Value(), m_ZeroMask()))) {
+    // Sink insert
+    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
+    // Sink shuffle
+    Ops.push_back(&I->getOperandUse(1));
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index ac3a333991684..2ce6cbf3ba026 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -71,12 +71,16 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
   TTI::ReductionShuffle
   getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const;
 
-  /// @}
 
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
   bool supportsTailCalls() const;
+
+  bool isProfitableToSinkOperands(Instruction *I,
+                                  SmallVectorImpl<Use *> &Ops) const;
+
+  /// @}
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 73f7f52846f62..5d922d8cc02dc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34662,29 +34662,6 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
   return true;
 }
 
-bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
-  unsigned Bits = Ty->getScalarSizeInBits();
-
-  // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
-  // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
-  if (Subtarget.hasXOP() &&
-      (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
-    return false;
-
-  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
-  // shifts just as cheap as scalar ones.
-  if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
-    return false;
-
-  // AVX512BW has shifts such as vpsllvw.
-  if (Subtarget.hasBWI() && Bits == 16)
-    return false;
-
-  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
-  // fully general vector.
-  return true;
-}
-
 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
   switch (Opcode) {
   // These are non-commutative binops.
@@ -34799,63 +34776,6 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return false;
 }
 
-bool X86TargetLowering::shouldSinkOperands(Instruction *I,
-                                           SmallVectorImpl<Use *> &Ops) const {
-  using namespace llvm::PatternMatch;
-
-  FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
-  if (!VTy)
-    return false;
-
-  if (I->getOpcode() == Instruction::Mul &&
-      VTy->getElementType()->isIntegerTy(64)) {
-    for (auto &Op : I->operands()) {
-      // Make sure we are not already sinking this operand
-      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
-        continue;
-
-      // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
-      // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
-      if (Subtarget.hasSSE41() &&
-          match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
-                                 m_SpecificInt(32)))) {
-        Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
-        Ops.push_back(&Op);
-      } else if (Subtarget.hasSSE2() &&
-                 match(Op.get(),
-                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
-        Ops.push_back(&Op);
-      }
-    }
-
-    return !Ops.empty();
-  }
-
-  // A uniform shift amount in a vector shift or funnel shift may be much
-  // cheaper than a generic variable vector shift, so make that pattern visible
-  // to SDAG by sinking the shuffle instruction next to the shift.
-  int ShiftAmountOpNum = -1;
-  if (I->isShift())
-    ShiftAmountOpNum = 1;
-  else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
-    if (II->getIntrinsicID() == Intrinsic::fshl ||
-        II->getIntrinsicID() == Intrinsic::fshr)
-      ShiftAmountOpNum = 2;
-  }
-
-  if (ShiftAmountOpNum == -1)
-    return false;
-
-  auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
-  if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
-      isVectorShiftByScalarCheap(I->getType())) {
-    Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
-    return true;
-  }
-
-  return false;
-}
-
 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
   if (!Subtarget.is64Bit())
     return false;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0ab42f032c3ea..a2515ff35e692 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1404,10 +1404,6 @@ namespace llvm {
 
     bool isLegalStoreImmediate(int64_t Imm) const override;
 
-    /// This is used to enable splatted operand transforms for vector shifts
-    /// and vector funnel shifts.
-    bool isVectorShiftByScalarCheap(Type *Ty) const override;
-
     /// Add x86-specific opcodes to the default list.
     bool isBinOp(unsigned Opcode) const override;
 
@@ -1434,8 +1430,6 @@ namespace llvm {
     bool isZExtFree(EVT VT1, EVT VT2) const override;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
 
-    bool shouldSinkOperands(Instruction *I,
-                            SmallVectorImpl<Use *> &Ops) const override;
     bool shouldConvertPhiType(Type *From, Type *To) const override;
 
     /// Return true if folding a vector load into ExtVal (a sign, zero, or any
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 46bc73c5e928e..33b0ef08c5b9e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6894,3 +6894,82 @@ InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
   // TODO: Hook MispredictPenalty of SchedMachineModel into this.
   return 14;
 }
+
+bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
+  unsigned Bits = Ty->getScalarSizeInBits();
+
+  // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
+  // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
+  if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
+    return false;
+
+  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
+  // shifts just as cheap as scalar ones.
+  if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
+    return false;
+
+  // AVX512BW has shifts such as vpsllvw.
+  if (ST->hasBWI() && Bits == 16)
+    return false;
+
+  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+  // fully general vector.
+  return true;
+}
+
+bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
+                                            SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+
+  FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
+  if (!VTy)
+    return false;
+
+  if (I->getOpcode() == Instruction::Mul &&
+      VTy->getElementType()->isIntegerTy(64)) {
+    for (auto &Op : I->operands()) {
+      // Make sure we are not already sinking this operand
+      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+        continue;
+
+      // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
+      // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
+      if (ST->hasSSE41() &&
+          match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
+                                 m_SpecificInt(32)))) {
+        Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+        Ops.push_back(&Op);
+      } else if (ST->hasSSE2() &&
+                 match(Op.get(),
+                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
+        Ops.push_back(&Op);
+      }
+    }
+
+    return !Ops.empty();
+  }
+
+  // A uniform shift amount in a vector shift or funnel shift may be much
+  // cheaper than a generic variable vector shift, so make that pattern visible
+  // to SDAG by sinking the shuffle instruction next to the shift.
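+  // For example, a shift by a splatted amount can be selected to the
+  // PSLLW/PSLLD/PSLLQ family, which reads one uniform count for all lanes,
+  // instead of a fully general per-lane variable shift.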
+  int ShiftAmountOpNum = -1;
+  if (I->isShift())
+    ShiftAmountOpNum = 1;
+  else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    if (II->getIntrinsicID() == Intrinsic::fshl ||
+        II->getIntrinsicID() == Intrinsic::fshr)
+      ShiftAmountOpNum = 2;
+  }
+
+  if (ShiftAmountOpNum == -1)
+    return false;
+
+  auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
+  if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
+      isVectorShiftByScalarCheap(I->getType())) {
+    Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index c16461b157e07..0100f328ab4bd 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -297,6 +297,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
 
   InstructionCost getBranchMispredictPenalty() const;
 
+  bool isProfitableToSinkOperands(Instruction *I,
+                                  SmallVectorImpl<Use *> &Ops) const;
+
+  bool isVectorShiftByScalarCheap(Type *Ty) const;
+
 private:
   bool supportsGather() const;
   InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,