diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 7b0475ac2481d..c88bf44c9b1e0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1381,6 +1381,16 @@ class MachineIRBuilder {
         Res, Val, buildConstant(LLT::scalar(VecIdxWidth), Idx));
   }
+  /// Build and insert \p Res = \p Val with \p Elt inserted at the constant
+  /// index \p Idx, by forwarding to buildInsertVectorElement.
+  ///
+  /// The index is materialized with buildConstant as a scalar whose bit width
+  /// is the size of the target's vector index type for the current data
+  /// layout.
+  ///
+  /// \return a MachineInstrBuilder for the insert built by
+  /// buildInsertVectorElement.
+  MachineInstrBuilder buildInsertVectorElementConstant(const DstOp &Res,
+                                                       const SrcOp &Val,
+                                                       const SrcOp &Elt,
+                                                       const int Idx) {
+    const auto *Lowering = getMF().getSubtarget().getTargetLowering();
+    const unsigned IdxBits =
+        Lowering->getVectorIdxTy(getDataLayout()).getSizeInBits();
+    // Emit the index constant first, then the insert that consumes it.
+    auto IdxCst = buildConstant(LLT::scalar(IdxBits), Idx);
+    return buildInsertVectorElement(Res, Val, Elt, IdxCst);
+  }
+
   /// Build and insert \p Res = G_EXTRACT_VECTOR_ELT \p Val, \p Idx
   ///
   /// \pre setBasicBlock or setMI must have been called.
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3fb1347b58e4b..cc67d38308e18 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -124,6 +124,9 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
   if (isa(MI))
     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
+  if (isa(MI))
+    return LI.legalizeCustom(*this, MI, LocObserver) ?
Legalized + : UnableToLegalize; auto Step = LI.getAction(MI, MRI); switch (Step.Action) { case Legal: diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 625d556e3ff5e..76e498650d9d8 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -467,6 +467,7 @@ std::optional llvm::getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) { Register DefSrcReg = Reg; auto *DefMI = MRI.getVRegDef(Reg); + assert(DefMI && "expected non-null machine instr"); auto DstTy = MRI.getType(DefMI->getOperand(0).getReg()); if (!DstTy.isValid()) return std::nullopt; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 740e52fb87dc2..49fb73363a760 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2144,6 +2144,8 @@ bool AMDGPULegalizerInfo::legalizeCustom( return legalizeExtractVectorElt(MI, MRI, B); case TargetOpcode::G_INSERT_VECTOR_ELT: return legalizeInsertVectorElt(MI, MRI, B); + case TargetOpcode::G_INSERT_SUBVECTOR: + return legalizeInsertSubVector(MI, MRI, B); case TargetOpcode::G_FSIN: case TargetOpcode::G_FCOS: return legalizeSinCos(MI, MRI, B); @@ -2838,6 +2840,64 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( return true; } +// This lowers a G_INSERT_SUBVECTOR by extracting the individual elements from +// the small vector and inserting them into the big vector. That is better than +// the default expansion of doing it via a stack slot. Even though the use of +// the stack slot would be optimized away afterwards, the stack slot itself +// remains. 
+//
+// The last instruction of the expansion defines the original result register
+// of \p MI, so its users still have a definition once \p MI is erased. Always
+// returns true.
+bool AMDGPULegalizerInfo::legalizeInsertSubVector(MachineInstr &MI,
+                                                  MachineRegisterInfo &MRI,
+                                                  MachineIRBuilder &B) const {
+  auto *InsertSV = cast<GInsertSubvector>(&MI);
+  const Register Dst = MI.getOperand(0).getReg();
+  Register Vec = InsertSV->getBigVec();
+  const Register Ins = InsertSV->getSubVec();
+  const uint64_t IdxVal = InsertSV->getIndexImm();
+
+  const LLT VecVT = MRI.getType(Vec);
+  const LLT InsVT = MRI.getType(Ins);
+  const LLT EltVT = VecVT.getElementType();
+  assert(EltVT == InsVT.getElementType() &&
+         "subvector and vector element types must match");
+
+  const ElementCount VecVTEC = VecVT.getElementCount();
+  const ElementCount InsVTEC = InsVT.getElementCount();
+  const unsigned VecNumElts = VecVTEC.getKnownMinValue();
+  const unsigned InsNumElts = InsVTEC.getKnownMinValue();
+
+  // Pairs of 16-bit elements can be moved as whole 32-bit values, but only if
+  // the insertion index and both element counts are even. A two-element big
+  // vector is excluded as well, because it would need a one-element s32
+  // vector type. Everything else takes the element-wise path below.
+  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0 &&
+      InsNumElts % 2 == 0 && VecNumElts % 2 == 0 && VecNumElts > 2) {
+    const LLT NewVecVT = LLT::vector(VecVTEC.divideCoefficientBy(2), S32);
+    // A two-element subvector collapses to a single s32 scalar.
+    const bool InsIsScalar = InsNumElts == 2;
+    const LLT NewInsVT =
+        InsIsScalar ? S32 : LLT::vector(InsVTEC.divideCoefficientBy(2), S32);
+
+    auto VecB = B.buildBitcast(NewVecVT, Vec);
+    auto InsB = B.buildBitcast(NewInsVT, Ins);
+
+    for (unsigned I = 0, E = InsNumElts / 2; I != E; ++I) {
+      MachineInstrBuilder Elt =
+          InsIsScalar ? InsB
+                      : B.buildExtractVectorElementConstant(S32, InsB, I);
+      VecB = B.buildInsertVectorElementConstant(NewVecVT, VecB, Elt,
+                                                IdxVal / 2 + I);
+    }
+
+    // Cast back into the original result register rather than a fresh one;
+    // otherwise the users of MI's result lose their definition.
+    B.buildBitcast(Dst, VecB);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Element-wise fallback: copy one element at a time. The final insert
+  // defines Dst directly so the result replaces MI's.
+  for (unsigned I = 0; I != InsNumElts; ++I) {
+    auto Elt = B.buildExtractVectorElementConstant(EltVT, Ins, I);
+    const bool IsLast = I + 1 == InsNumElts;
+    Vec = B.buildInsertVectorElementConstant(
+               IsLast ? DstOp(Dst) : DstOp(VecVT), Vec, Elt, IdxVal + I)
+              .getReg(0);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeSinCos(
     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 03b7c36fc450f..78d4d8bd8a3f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -60,6 +60,8 @@
class AMDGPULegalizerInfo final : public LegalizerInfo { MachineIRBuilder &B) const; bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeInsertSubVector(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const;