Skip to content
19 changes: 19 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));

// Cap on the element count of the vector type addressed by a GEP with a
// non-constant index whose result is loaded. Above this count the user is
// rejected during promotion of the alloca to a vector.
static cl::opt<unsigned> DynamicIndexNumberElementLimit(
    "amdgpu-dynamic-index-num-element-limit",
    cl::desc(
        "Maximum number of elements for promoting alloca with dynamic index"),
    cl::init(8));

// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
Expand Down Expand Up @@ -920,6 +926,19 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");

if (!isa<ConstantInt>(Index)) {
bool UsedInLoad = llvm::any_of(
GEP->users(), [&](const auto *U) { return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad &&
UserVecTy->getNumElements() > DynamicIndexNumberElementLimit) {
return RejectUser(Inst,
"user has too many elements for dynamic index");
}
}
}

GEPVectorIdx[GEP] = Index;
UsersToRemove.push_back(Inst);
continue;
Expand Down
80 changes: 80 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

; Check that invalid IR is not produced on a vector typed
; getelementptr with a scalar alloca pointer base.
; Also check that a GEP with a dynamic index is rejected when its vector type
; has more elements than the threshold.

define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
Expand Down Expand Up @@ -250,6 +252,84 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}

; Dynamic-index GEP over a <16 x i8> source element type whose result is loaded.
; The expected output keeps the alloca, GEP and load as written, i.e. the alloca
; is not promoted to a vector. 16 elements is above the pass's default limit of
; 8 (assumes the RUN line does not override that option -- TODO confirm).
; NOTE(review): the function name says v32i8 but the GEP/load type is <16 x i8>;
; confirm which was intended.
define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
; CHECK-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
; CHECK-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%alloca = alloca [64 x i8], align 4, addrspace(5)
%gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx
%vec = load <16 x i8>, ptr addrspace(5) %gep, align 4
store <16 x i8> %vec, ptr addrspace(1) %out, align 4
ret void
}

; Dynamic-index GEP over a <8 x i8> source element type whose result is loaded.
; The expected output replaces the alloca with a frozen poison <64 x i8> value
; and expands the load into eight extractelement/insertelement pairs, starting
; at element index IDX * 8 and stepping by one.
define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%alloca = alloca [64 x i8], align 4, addrspace(5)
%gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
%vec = load <8 x i8>, ptr addrspace(5) %gep, align 4
store <8 x i8> %vec, ptr addrspace(1) %out, align 4
ret void
}

; Dynamic-index GEP over a <8 x i8> source element type whose only user is a
; ptrtoint; the GEP result is never loaded. The expected output leaves the
; alloca, GEP, ptrtoint and store as written.
; NOTE(review): presumably the ptrtoint user is what prevents promotion here
; rather than the element-count limit -- confirm.
define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%alloca = alloca [64 x i8], align 4, addrspace(5)
%gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
%gepint = ptrtoint ptr addrspace(5) %gep to i64
store i64 %gepint, ptr addrspace(1) %out, align 4
ret void
}


;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
Expand Down
Loading