@@ -66,6 +66,19 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
6666 cl::desc (" Maximum byte size to consider promote alloca to vector" ),
6767 cl::init(0 ));
6868
69+ static cl::opt<unsigned > PromoteAllocaToVectorMaxRegs (
70+ " amdgpu-promote-alloca-to-vector-max-regs" ,
71+ cl::desc (
72+ " Maximum vector size (in 32b registers) to use when promoting alloca" ),
73+ cl::init(16 ));
74+
75+ // Use up to 1/4 of available register budget for vectorization.
76+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
77+ static cl::opt<unsigned > PromoteAllocaToVectorVGPRRatio (
78+ " amdgpu-promote-alloca-to-vector-vgpr-ratio" ,
79+ cl::desc (" Ratio of VGPRs to budget for promoting alloca to vectors" ),
80+ cl::init(4 ));
81+
6982static cl::opt<unsigned >
7083 LoopUserWeight (" promote-alloca-vector-loop-user-weight" ,
7184 cl::desc (" The bonus weight of users of allocas within loop "
@@ -84,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
8497 uint32_t LocalMemLimit = 0 ;
8598 uint32_t CurrentLocalMemUsage = 0 ;
8699 unsigned MaxVGPRs;
100+ unsigned VGPRBudgetRatio;
101+ unsigned MaxVectorRegs;
87102
88103 bool IsAMDGCN = false ;
89104 bool IsAMDHSA = false ;
@@ -112,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {
112127
113128 void sortAllocasToPromote (SmallVectorImpl<AllocaInst *> &Allocas);
114129
130+ void setFunctionLimits (const Function &F);
131+
115132public:
116133 AMDGPUPromoteAllocaImpl (TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
117134
@@ -298,6 +315,19 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
298315 // clang-format on
299316}
300317
318+ void AMDGPUPromoteAllocaImpl::setFunctionLimits (const Function &F) {
319+ // Load per function limits, overriding with global options where appropriate.
320+ MaxVectorRegs = F.getFnAttributeAsParsedInteger (
321+ " amdgpu-promote-alloca-to-vector-max-regs" , PromoteAllocaToVectorMaxRegs);
322+ if (PromoteAllocaToVectorMaxRegs.getNumOccurrences ())
323+ MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
324+ VGPRBudgetRatio = F.getFnAttributeAsParsedInteger (
325+ " amdgpu-promote-alloca-to-vector-vgpr-ratio" ,
326+ PromoteAllocaToVectorVGPRRatio);
327+ if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences ())
328+ VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
329+ }
330+
301331bool AMDGPUPromoteAllocaImpl::run (Function &F, bool PromoteToLDS) {
302332 Mod = F.getParent ();
303333 DL = &Mod->getDataLayout ();
@@ -307,15 +337,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
307337 return false ;
308338
309339 MaxVGPRs = getMaxVGPRs (TM, F);
340+ setFunctionLimits (F);
310341
311342 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem (F);
312343
313- // Use up to 1/4 of available register budget for vectorization.
314- // FIXME: Increase the limit for whole function budgets? Perhaps x2?
315344 unsigned VectorizationBudget =
316345 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317346 : (MaxVGPRs * 32 )) /
318- 4 ;
347+ VGPRBudgetRatio ;
319348
320349 SmallVector<AllocaInst *, 16 > Allocas;
321350 for (Instruction &I : F.getEntryBlock ()) {
@@ -400,7 +429,8 @@ static Value *calculateVectorIndex(
400429}
401430
402431static Value *GEPToVectorIndex (GetElementPtrInst *GEP, AllocaInst *Alloca,
403- Type *VecElemTy, const DataLayout &DL) {
432+ Type *VecElemTy, const DataLayout &DL,
433+ SmallVector<Instruction *> &NewInsts) {
404434 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
405435 // helper.
406436 unsigned BW = DL.getIndexTypeSizeInBits (GEP->getType ());
@@ -414,22 +444,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
414444 if (VarOffsets.size () > 1 )
415445 return nullptr ;
416446
417- if (VarOffsets.size () == 1 ) {
418- // Only handle cases where we don't need to insert extra arithmetic
419- // instructions.
420- const auto &VarOffset = VarOffsets.front ();
421- if (!ConstOffset.isZero () || VarOffset.second != VecElemSize)
422- return nullptr ;
423- return VarOffset.first ;
424- }
425-
426447 APInt Quot;
427448 uint64_t Rem;
428449 APInt::udivrem (ConstOffset, VecElemSize, Quot, Rem);
429450 if (Rem != 0 )
430451 return nullptr ;
431452
432- return ConstantInt::get (GEP->getContext (), Quot);
453+ ConstantInt *ConstIndex = ConstantInt::get (GEP->getContext (), Quot);
454+ if (VarOffsets.size () == 0 )
455+ return ConstIndex;
456+
457+ IRBuilder<> Builder (GEP);
458+
459+ const auto &VarOffset = VarOffsets.front ();
460+ APInt::udivrem (VarOffset.second , VecElemSize, Quot, Rem);
461+ if (Rem != 0 || Quot.isZero ())
462+ return nullptr ;
463+
464+ Value *Offset = VarOffset.first ;
465+ if (!Quot.isOne ()) {
466+ ConstantInt *ConstMul = ConstantInt::get (GEP->getContext (), Quot);
467+ Offset = Builder.CreateMul (Offset, ConstMul);
468+ if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
469+ NewInsts.push_back (NewInst);
470+ }
471+ if (ConstOffset.isZero ())
472+ return Offset;
473+
474+ Value *IndexAdd = Builder.CreateAdd (ConstIndex, Offset);
475+ if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
476+ NewInsts.push_back (NewInst);
477+ return IndexAdd;
433478}
434479
435480// / Promotes a single user of the alloca to a vector form.
@@ -737,23 +782,44 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
737782 Type *AllocaTy = Alloca.getAllocatedType ();
738783 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
739784 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
740- if (VectorType::isValidElementType (ArrayTy->getElementType ()) &&
741- ArrayTy->getNumElements () > 0 )
742- VectorTy = FixedVectorType::get (ArrayTy->getElementType (),
743- ArrayTy->getNumElements ());
785+ uint64_t NumElems = 1 ;
786+ Type *ElemTy;
787+ do {
788+ NumElems *= ArrayTy->getNumElements ();
789+ ElemTy = ArrayTy->getElementType ();
790+ } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
791+
792+ // Check for array of vectors
793+ auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
794+ if (InnerVectorTy) {
795+ NumElems *= InnerVectorTy->getNumElements ();
796+ ElemTy = InnerVectorTy->getElementType ();
797+ }
798+
799+ if (VectorType::isValidElementType (ElemTy) && NumElems > 0 ) {
800+ unsigned ElementSize = DL->getTypeSizeInBits (ElemTy) / 8 ;
801+ unsigned AllocaSize = DL->getTypeStoreSize (AllocaTy);
802+ // Expand vector if required to match padding of inner type,
803+ // i.e. odd size subvectors.
804+ // Storage size of new vector must match that of alloca for correct
805+ // behaviour of byte offsets and GEP computation.
806+ if (NumElems * ElementSize != AllocaSize)
807+ NumElems = AllocaSize / ElementSize;
808+ if (NumElems > 0 && (AllocaSize % ElementSize) == 0 )
809+ VectorTy = FixedVectorType::get (ElemTy, NumElems);
810+ }
744811 }
745812
746- // FIXME: There is no reason why we can't support larger arrays, we
747- // are just being conservative for now.
748- // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
749- // equivalent. Potentially these could also be promoted but we don't currently
750- // handle this case
751813 if (!VectorTy) {
752814 LLVM_DEBUG (dbgs () << " Cannot convert type to vector\n " );
753815 return false ;
754816 }
755817
756- if (VectorTy->getNumElements () > 16 || VectorTy->getNumElements () < 2 ) {
818+ const unsigned MaxElements =
819+ (MaxVectorRegs * 32 ) / DL->getTypeSizeInBits (VectorTy->getElementType ());
820+
821+ if (VectorTy->getNumElements () > MaxElements ||
822+ VectorTy->getNumElements () < 2 ) {
757823 LLVM_DEBUG (dbgs () << " " << *VectorTy
758824 << " has an unsupported number of elements\n " );
759825 return false ;
@@ -763,11 +829,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
763829 SmallVector<Instruction *> WorkList;
764830 SmallVector<Instruction *> UsersToRemove;
765831 SmallVector<Instruction *> DeferredInsts;
832+ SmallVector<Instruction *> NewGEPInsts;
766833 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
767834
768835 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
769836 LLVM_DEBUG (dbgs () << " Cannot promote alloca to vector: " << Msg << " \n "
770837 << " " << *Inst << " \n " );
838+ for (auto *Inst : reverse (NewGEPInsts))
839+ Inst->eraseFromParent ();
771840 return false ;
772841 };
773842
@@ -817,7 +886,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
817886 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
818887 // If we can't compute a vector index from this GEP, then we can't
819888 // promote this alloca to vector.
820- Value *Index = GEPToVectorIndex (GEP, &Alloca, VecEltTy, *DL);
889+ Value *Index = GEPToVectorIndex (GEP, &Alloca, VecEltTy, *DL, NewGEPInsts );
821890 if (!Index)
822891 return RejectUser (Inst, " cannot compute vector index for GEP" );
823892
0 commit comments