Skip to content

Commit 457d13d

Browse files
committed
[AMDGPU] Limit promoting allocas that have users with dynamic index above a threshold on number of elements
The AMDGPU backend generates poor code (a scalarized copy) when extracting subvectors with a dynamic index, which can hurt compile time, register pressure, etc. For vectors with a large number of elements (e.g. <128 x i8> with a <32 x i8> user), dynamic indexing will blow up compile time in GreedyRA. Added a check on the GEP to see whether it is used in a load. Added a test case that exercises different numbers of elements in the subvector user.
1 parent d7b5469 commit 457d13d

File tree

2 files changed

+102
-0
lines changed

2 files changed

+102
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ static cl::opt<unsigned>
8585
"when sorting profitable allocas"),
8686
cl::init(4));
8787

88+
static cl::opt<unsigned> DynIdxNumElmLimit("dynamic-index-num-element-limit",
89+
cl::desc("Maximum number of elements for promoting alloca with dynamic"
90+
" index"),
91+
cl::init(8));
92+
8893
// Shared implementation which can do both promotion to vector and to LDS.
8994
class AMDGPUPromoteAllocaImpl {
9095
private:
@@ -919,6 +924,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
919924
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
920925
if (!Index)
921926
return RejectUser(Inst, "cannot compute vector index for GEP");
927+
928+
if (!isa<ConstantInt>(Index)) {
929+
bool UsedInLoad = false;
930+
for (auto *U : GEP->users()) {
931+
if(isa<LoadInst>(U)) {
932+
UsedInLoad = true;
933+
break;
934+
}
935+
}
936+
if (auto *UserVecTy = dyn_cast<FixedVectorType>(
937+
GEP->getSourceElementType())) {
938+
if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
939+
return RejectUser(Inst,
940+
"user has too many number of elements for dynamic index");
941+
}
942+
}
943+
}
922944

923945
GEPVectorIdx[GEP] = Index;
924946
UsersToRemove.push_back(Inst);

llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
; Check that invalid IR is not produced on a vector typed
55
; getelementptr with a scalar alloca pointer base.
6+
; Also check if GEP with dynamic index is rejected above
7+
; threshold # of elements.
68

79
define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
810
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
@@ -250,6 +252,84 @@ bb2:
250252
store i32 0, ptr addrspace(5) %extractelement
251253
ret void
252254
}
255+
256+
define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) {
257+
; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
258+
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
259+
; CHECK-NEXT: [[ENTRY:.*:]]
260+
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
261+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
262+
; CHECK-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
263+
; CHECK-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
264+
; CHECK-NEXT: ret void
265+
;
266+
entry:
267+
%alloca = alloca [64 x i8], align 4, addrspace(5)
268+
%gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx
269+
%vec = load <16 x i8>, ptr addrspace(5) %gep, align 4
270+
store <16 x i8> %vec, ptr addrspace(1) %out, align 4
271+
ret void
272+
}
273+
274+
define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) {
275+
; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
276+
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
277+
; CHECK-NEXT: [[ENTRY:.*:]]
278+
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
279+
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
280+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
281+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
282+
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
283+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
284+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
285+
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
286+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
287+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
288+
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
289+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
290+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
291+
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
292+
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
293+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
294+
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
295+
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
296+
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
297+
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
298+
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
299+
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
300+
; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
301+
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
302+
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
303+
; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
304+
; CHECK-NEXT: ret void
305+
;
306+
entry:
307+
%alloca = alloca [64 x i8], align 4, addrspace(5)
308+
%gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
309+
%vec = load <8 x i8>, ptr addrspace(5) %gep, align 4
310+
store <8 x i8> %vec, ptr addrspace(1) %out, align 4
311+
ret void
312+
}
313+
314+
define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) {
315+
; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
316+
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
317+
; CHECK-NEXT: [[ENTRY:.*:]]
318+
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
319+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
320+
; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
321+
; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
322+
; CHECK-NEXT: ret void
323+
;
324+
entry:
325+
%alloca = alloca [64 x i8], align 4, addrspace(5)
326+
%gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
327+
%gepint = ptrtoint ptr addrspace(5) %gep to i64
328+
store i64 %gepint, ptr addrspace(1) %out, align 4
329+
ret void
330+
}
331+
332+
253333
;.
254334
; CHECK: [[META0]] = !{}
255335
; CHECK: [[RNG1]] = !{i32 0, i32 1025}

0 commit comments

Comments
 (0)