@@ -66,6 +66,18 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
6666 cl::desc (" Maximum byte size to consider promote alloca to vector" ),
6767 cl::init(0 ));
6868
// Upper bound on the element count of the vector type an alloca may be
// promoted to (default 16). tryPromoteAllocaToVector() uses the command-line
// value when the flag was given explicitly; otherwise it reads the function
// attribute of the same name and falls back to this option's value. Candidate
// vector types with more elements than the resulting limit are rejected.
69+ static cl::opt<unsigned > PromoteAllocaToVectorMaxElements (
70+ " amdgpu-promote-alloca-to-vector-max-elements" ,
71+ cl::desc (" Maximum vector size (in elements) to use when promoting alloca" ),
72+ cl::init(16 ));
73+
// Divisor applied to the register budget when run() sizes the vectorization
// budget. run() uses the command-line value when the flag was given explicitly;
// otherwise it reads the function attribute of the same name and falls back to
// this option's value.
// NOTE(review): the value is used directly as a divisor in run() and no zero
// check is visible there, so a ratio of 0 (flag or attribute) would divide by
// zero -- confirm a guard exists or add one.
74+ // With the default ratio of 4, use up to 1/4 of the available register budget for vectorization.
75+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
76+ static cl::opt<unsigned > PromoteAllocaToVectorVGPRRatio (
77+ " amdgpu-promote-alloca-to-vector-vgpr-ratio" ,
78+ cl::desc (" Ratio of VGPRs to budget for promoting alloca to vectors" ),
79+ cl::init(4 ));
80+
6981static cl::opt<unsigned >
7082 LoopUserWeight (" promote-alloca-vector-loop-user-weight" ,
7183 cl::desc (" The bonus weight of users of allocas within loop "
@@ -310,12 +322,17 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
310322
311323 bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem (F) : false ;
312324
313- // Use up to 1/4 of available register budget for vectorization.
314- // FIXME: Increase the limit for whole function budgets? Perhaps x2?
325+ const unsigned VGPRRatio =
326+ PromoteAllocaToVectorVGPRRatio.getNumOccurrences ()
327+ ? PromoteAllocaToVectorVGPRRatio
328+ : F.getFnAttributeAsParsedInteger (
329+ " amdgpu-promote-alloca-to-vector-vgpr-ratio" ,
330+ PromoteAllocaToVectorVGPRRatio);
331+
315332 unsigned VectorizationBudget =
316333 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317334 : (MaxVGPRs * 32 )) /
318- 4 ;
335+ VGPRRatio ;
319336
320337 SmallVector<AllocaInst *, 16 > Allocas;
321338 for (Instruction &I : F.getEntryBlock ()) {
@@ -398,7 +415,8 @@ calculateVectorIndex(Value *Ptr,
398415}
399416
400417static Value *GEPToVectorIndex (GetElementPtrInst *GEP, AllocaInst *Alloca,
401- Type *VecElemTy, const DataLayout &DL) {
418+ Type *VecElemTy, const DataLayout &DL,
419+ SmallVector<Instruction *> &NewInsts) {
402420 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
403421 // helper.
404422 unsigned BW = DL.getIndexTypeSizeInBits (GEP->getType ());
@@ -412,22 +430,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
412430 if (VarOffsets.size () > 1 )
413431 return nullptr ;
414432
415- if (VarOffsets.size () == 1 ) {
416- // Only handle cases where we don't need to insert extra arithmetic
417- // instructions.
418- const auto &VarOffset = VarOffsets.front ();
419- if (!ConstOffset.isZero () || VarOffset.second != VecElemSize)
420- return nullptr ;
421- return VarOffset.first ;
422- }
423-
424433 APInt Quot;
425434 uint64_t Rem;
426435 APInt::udivrem (ConstOffset, VecElemSize, Quot, Rem);
427436 if (Rem != 0 )
428437 return nullptr ;
429438
430- return ConstantInt::get (GEP->getContext (), Quot);
439+ ConstantInt *ConstIndex = ConstantInt::get (GEP->getContext (), Quot);
440+ if (VarOffsets.size () == 0 )
441+ return ConstIndex;
442+
443+ IRBuilder<> Builder (GEP);
444+
445+ const auto &VarOffset = VarOffsets.front ();
446+ APInt::udivrem (VarOffset.second , VecElemSize, Quot, Rem);
447+ if (Rem != 0 || Quot.isZero ())
448+ return nullptr ;
449+
450+ Value *Offset = VarOffset.first ;
451+ if (!Quot.isOne ()) {
452+ ConstantInt *ConstMul = ConstantInt::get (GEP->getContext (), Quot);
453+ Offset = Builder.CreateMul (Offset, ConstMul);
454+ if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
455+ NewInsts.push_back (NewInst);
456+ }
457+ if (ConstOffset.isZero ())
458+ return Offset;
459+
460+ Value *IndexAdd = Builder.CreateAdd (ConstIndex, Offset);
461+ if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
462+ NewInsts.push_back (NewInst);
463+ return IndexAdd;
431464}
432465
433466// / Promotes a single user of the alloca to a vector form.
@@ -735,23 +768,48 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
735768 Type *AllocaTy = Alloca.getAllocatedType ();
736769 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
737770 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
738- if (VectorType::isValidElementType (ArrayTy->getElementType ()) &&
739- ArrayTy->getNumElements () > 0 )
740- VectorTy = FixedVectorType::get (ArrayTy->getElementType (),
741- ArrayTy->getNumElements ());
771+ uint64_t NumElems = 1 ;
772+ Type *ElemTy;
773+ do {
774+ NumElems *= ArrayTy->getNumElements ();
775+ ElemTy = ArrayTy->getElementType ();
776+ } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
777+
778+ // Check for array of vectors
779+ auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
780+ if (InnerVectorTy) {
781+ NumElems *= InnerVectorTy->getNumElements ();
782+ ElemTy = InnerVectorTy->getElementType ();
783+ }
784+
785+ if (VectorType::isValidElementType (ElemTy) && NumElems > 0 ) {
786+ unsigned ElementSize = DL->getTypeSizeInBits (ElemTy) / 8 ;
787+ unsigned AllocaSize = DL->getTypeStoreSize (AllocaTy);
788+ // Expand vector if required to match padding of inner type,
789+ // i.e. odd size subvectors.
790+ // Storage size of new vector must match that of alloca for correct
791+ // behaviour of byte offsets and GEP computation.
792+ if (NumElems * ElementSize != AllocaSize)
793+ NumElems = AllocaSize / ElementSize;
794+ if (NumElems > 0 && (AllocaSize % ElementSize) == 0 )
795+ VectorTy = FixedVectorType::get (ElemTy, NumElems);
796+ }
742797 }
743798
744- // FIXME: There is no reason why we can't support larger arrays, we
745- // are just being conservative for now.
746- // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
747- // equivalent. Potentially these could also be promoted but we don't currently
748- // handle this case
749799 if (!VectorTy) {
750800 LLVM_DEBUG (dbgs () << " Cannot convert type to vector\n " );
751801 return false ;
752802 }
753803
754- if (VectorTy->getNumElements () > 16 || VectorTy->getNumElements () < 2 ) {
804+ const unsigned MaxElements =
805+ PromoteAllocaToVectorMaxElements.getNumOccurrences ()
806+ ? PromoteAllocaToVectorMaxElements
807+ : Alloca.getParent ()->getParent ()->getFnAttributeAsParsedInteger (
808+ " amdgpu-promote-alloca-to-vector-max-elements" ,
809+ PromoteAllocaToVectorMaxElements);
810+
811+ if (VectorTy->getNumElements () > MaxElements ||
812+ VectorTy->getNumElements () < 2 ) {
755813 LLVM_DEBUG (dbgs () << " " << *VectorTy
756814 << " has an unsupported number of elements\n " );
757815 return false ;
@@ -761,11 +819,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
761819 SmallVector<Instruction *> WorkList;
762820 SmallVector<Instruction *> UsersToRemove;
763821 SmallVector<Instruction *> DeferredInsts;
822+ SmallVector<Instruction *> NewGEPInsts;
764823 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
765824
766825 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
767826 LLVM_DEBUG (dbgs () << " Cannot promote alloca to vector: " << Msg << " \n "
768827 << " " << *Inst << " \n " );
828+ for (auto *Inst : reverse (NewGEPInsts))
829+ Inst->eraseFromParent ();
769830 return false ;
770831 };
771832
@@ -815,7 +876,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
815876 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
816877 // If we can't compute a vector index from this GEP, then we can't
817878 // promote this alloca to vector.
818- Value *Index = GEPToVectorIndex (GEP, &Alloca, VecEltTy, *DL);
879+ Value *Index = GEPToVectorIndex (GEP, &Alloca, VecEltTy, *DL, NewGEPInsts );
819880 if (!Index)
820881 return RejectUser (Inst, " cannot compute vector index for GEP" );
821882
0 commit comments