From 97f7c78f07b35264305b54d1ce400a83c6282e42 Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Tue, 2 Dec 2025 09:51:10 -0600 Subject: [PATCH 01/12] [AMDGPU] Limit promoting allocas that have users with dynamic index above a threshold on number of elements AMDGPU backend has poor code generation (scalarized copy) for extracting subvectors with dynamic index that can impact compile-time, reg-pressure, etc. For vectors with large number of elements (i.e. <128 x i8> with <32 x i8> user), dynamic indexing will blow up compile-time in GreedyRA. Added check in GEP to see if it's used in a load. Added testcase to test different number of elements in subvector user. --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 22 +++++ .../AMDGPU/promote-alloca-vector-gep.ll | 80 +++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index bb95265a794a0..aba660ffb6e45 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -85,6 +85,11 @@ static cl::opt "when sorting profitable allocas"), cl::init(4)); +static cl::opt DynIdxNumElmLimit("dynamic-index-num-element-limit", + cl::desc("Maximum number of elements for promoting alloca with dynamic" + " index"), + cl::init(8)); + // Shared implementation which can do both promotion to vector and to LDS. class AMDGPUPromoteAllocaImpl { private: @@ -919,6 +924,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); + + if (!isa(Index)) { + bool UsedInLoad = false; + for (auto *U : GEP->users()) { + if(isa(U)) { + UsedInLoad = true; + break; + } + } + if (auto *UserVecTy = dyn_cast( + GEP->getSourceElementType())) { + if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) { + return RejectUser(Inst, + "user has too many number of elements for dynamic index"); + } + } + } GEPVectorIdx[GEP] = Index; UsersToRemove.push_back(Inst); diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll index 76e1868b3c4b9..caab29b58c13f 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll @@ -3,6 +3,8 @@ ; Check that invalid IR is not produced on a vector typed ; getelementptr with a scalar alloca pointer base. +; Also check if GEP with dynamic index is rejected above +; threshold # of elements. define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() { ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() { @@ -250,6 +252,84 @@ bb2: store i32 0, ptr addrspace(5) %extractelement ret void } + +define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) { +; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; CHECK-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4 +; CHECK-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx + %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4 + store <16 x i8> %vec, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) { +; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7 +; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx + %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4 + store <8 x i8> %vec, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) { +; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64 +; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx + %gepint = ptrtoint ptr addrspace(5) %gep to i64 + store i64 %gepint, ptr addrspace(1) %out, align 4 + ret void +} + + ;. ; CHECK: [[META0]] = !{} ; CHECK: [[RNG1]] = !{i32 0, i32 1025} From 406a57568f58eb0016f916cf028f11c51c9ace0a Mon Sep 17 00:00:00 2001 From: Kevin Choi <5455710+choikwa@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:07:35 -0500 Subject: [PATCH 02/12] Update llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index aba660ffb6e45..783a9408e249f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -937,7 +937,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { GEP->getSourceElementType())) { if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) { return RejectUser(Inst, - "user has too many number of elements for dynamic index"); + "user has too many elements for dynamic index"); } } } From a311a654a85840de44e4a5a8d42cf65c36ed4045 Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Tue, 2 Dec 2025 16:57:33 -0600 Subject: [PATCH 03/12] NFC, formatting --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 783a9408e249f..1282641bd3325 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -85,9 +85,10 @@ static cl::opt "when sorting profitable allocas"), cl::init(4)); -static cl::opt DynIdxNumElmLimit("dynamic-index-num-element-limit", +static cl::opt DynIdxNumElmLimit( + "dynamic-index-num-element-limit", cl::desc("Maximum number of elements for promoting alloca with dynamic" - " index"), + " index"), cl::init(8)); // Shared implementation which can do both promotion to vector and to LDS. @@ -924,20 +925,20 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); - + if (!isa(Index)) { bool UsedInLoad = false; for (auto *U : GEP->users()) { - if(isa(U)) { + if (isa(U)) { UsedInLoad = true; break; } } - if (auto *UserVecTy = dyn_cast( - GEP->getSourceElementType())) { + if (auto *UserVecTy = + dyn_cast(GEP->getSourceElementType())) { if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) { - return RejectUser(Inst, - "user has too many elements for dynamic index"); + return RejectUser(Inst, + "user has too many elements for dynamic index"); } } } From 80fb5a41eb4d5f0a2dbba5a5d51e192492d6e6da Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 12:38:44 -0600 Subject: [PATCH 04/12] addressing feedback --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 1282641bd3325..e618b88253457 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -86,7 +86,7 @@ static cl::opt cl::init(4)); static cl::opt DynIdxNumElmLimit( - "dynamic-index-num-element-limit", + "amdgpu-dynamic-index-num-element-limit", cl::desc("Maximum number of elements for promoting alloca with dynamic" " index"), cl::init(8)); @@ -927,13 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "cannot compute vector index for GEP"); if (!isa(Index)) { - bool UsedInLoad = false; - for (auto *U : GEP->users()) { - if (isa(U)) { - UsedInLoad = true; - break; - } - } + bool UsedInLoad = llvm::any_of(GEP->users(), + [&](const auto *U){ return isa(U); }); if (auto *UserVecTy = dyn_cast(GEP->getSourceElementType())) { if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) { From 671de2f770da59846de5908dfd5648cc91c7e92b Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 13:07:11 -0600 Subject: [PATCH 05/12] nfc, rename var --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index e618b88253457..ec4abe281e7fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -85,7 +85,7 @@ static cl::opt "when sorting profitable allocas"), cl::init(4)); -static cl::opt DynIdxNumElmLimit( +static cl::opt DynamicIndexNumberElementLimit( "amdgpu-dynamic-index-num-element-limit", cl::desc("Maximum number of elements for promoting alloca with dynamic" " index"), @@ -931,7 +931,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { [&](const auto *U){ return isa(U); }); if (auto *UserVecTy = dyn_cast(GEP->getSourceElementType())) { - if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) { + if (UsedInLoad && + UserVecTy->getNumElements() > DynamicIndexNumberElementLimit) { return RejectUser(Inst, "user has too many elements for dynamic index"); } From 3b7249961aded412120eec78a8fde151024751e2 Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 13:14:17 -0600 Subject: [PATCH 06/12] nfc, formatting --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index ec4abe281e7fd..912bd799db6c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -927,8 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "cannot compute vector index for GEP"); if (!isa(Index)) { - bool UsedInLoad = llvm::any_of(GEP->users(), - [&](const auto *U){ return isa(U); }); + bool UsedInLoad = llvm::any_of( + GEP->users(), [&](const auto *U){ return isa(U); }); if (auto *UserVecTy = dyn_cast(GEP->getSourceElementType())) { if (UsedInLoad && From 8e31b8549b7f2651780ecdba05f022cfcd844b3b Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 13:21:58 -0600 Subject: [PATCH 07/12] space --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 912bd799db6c3..38f4e07eb613f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -928,7 +928,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (!isa(Index)) { bool UsedInLoad = llvm::any_of( - GEP->users(), [&](const auto *U){ return isa(U); }); + GEP->users(), [&](const auto *U) { return isa(U); }); if (auto *UserVecTy = dyn_cast(GEP->getSourceElementType())) { if (UsedInLoad && From 904e5e019525e3948ada81f503e6074a41a91934 Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 13:54:35 -0600 Subject: [PATCH 08/12] addressing feedback, move tests to promote-alloca-vector-dynamic-idx.ll, test different limit values --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 6 +- .../promote-alloca-vector-dynamic-idx.ll | 533 ++++++++++++++++++ .../AMDGPU/promote-alloca-vector-gep.ll | 78 --- 3 files changed, 536 insertions(+), 81 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 38f4e07eb613f..3b656480770b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -85,8 +85,8 @@ static cl::opt "when sorting profitable allocas"), cl::init(4)); -static cl::opt DynamicIndexNumberElementLimit( - "amdgpu-dynamic-index-num-element-limit", +static cl::opt PromoteAllocaDynamicIndexNumberElementLimit( + "amdgpu-promote-alloca-dynamic-index-num-element-limit", cl::desc("Maximum number of elements for promoting alloca with dynamic" " index"), cl::init(8)); @@ -932,7 +932,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *UserVecTy = dyn_cast(GEP->getSourceElementType())) { if (UsedInLoad && - UserVecTy->getNumElements() > DynamicIndexNumberElementLimit) { + UserVecTy->getNumElements() > PromoteAllocaDynamicIndexNumberElementLimit) { return RejectUser(Inst, "user has too many elements for dynamic index"); } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll new file mode 100644 index 0000000000000..111f6e8f8d990 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll @@ -0,0 +1,533 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-dynamic-index-num-element-limit=4 < %s | FileCheck -check-prefix=C4 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=C8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-dynamic-index-num-element-limit=16 < %s | FileCheck -check-prefix=C16 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-dynamic-index-num-element-limit=32 < %s | FileCheck -check-prefix=C32 %s + +; Check if alloca is promoted if user is accessed with dynamic index + +define amdgpu_kernel void @GEP_dynamic_idx_v4i8(ptr addrspace(1) %out, i32 %idx) { +; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8( +; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C4-NEXT: [[ENTRY:.*:]] +; C4-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C4-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4 +; C4-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C4-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0 +; C4-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C4-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C4-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C4-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C4-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C4-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C4-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C4-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C4-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C4-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4 +; C4-NEXT: ret void +; +; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8( +; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C8-NEXT: [[ENTRY:.*:]] +; C8-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C8-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4 +; C8-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C8-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0 +; C8-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C8-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C8-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C8-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C8-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C8-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C8-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C8-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C8-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C8-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4 +; C8-NEXT: ret void +; +; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8( +; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C16-NEXT: [[ENTRY:.*:]] +; C16-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C16-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4 +; C16-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C16-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0 +; C16-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C16-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C16-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C16-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C16-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C16-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C16-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C16-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C16-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C16-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4 +; C16-NEXT: ret void +; +; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8( +; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C32-NEXT: [[ENTRY:.*:]] +; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4 +; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C32-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0 +; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C32-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C32-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C32-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C32-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4 +; C32-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <4 x i8>, ptr addrspace(5) %alloca, i32 %idx + %vec = load <4 x i8>, ptr addrspace(5) %gep, align 4 + store <4 x i8> %vec, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) { +; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8( +; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C4-NEXT: [[ENTRY:.*:]] +; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C4-NEXT: [[VEC:%.*]] = load <8 x i8>, ptr addrspace(5) [[GEP]], align 4 +; C4-NEXT: store <8 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 +; C4-NEXT: ret void +; +; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8( +; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C8-NEXT: [[ENTRY:.*:]] +; C8-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C8-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8 +; C8-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C8-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0 +; C8-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C8-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C8-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C8-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C8-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C8-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C8-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C8-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C8-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C8-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 +; C8-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] +; C8-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4 +; C8-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 +; C8-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] +; C8-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5 +; C8-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 +; C8-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] +; C8-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6 +; C8-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 +; C8-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] +; C8-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7 +; C8-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4 +; C8-NEXT: ret void +; +; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8( +; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C16-NEXT: [[ENTRY:.*:]] +; C16-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C16-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8 +; C16-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C16-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0 +; C16-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C16-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C16-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C16-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C16-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C16-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C16-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C16-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C16-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C16-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 +; C16-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] +; C16-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4 +; C16-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 +; C16-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] +; C16-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5 +; C16-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 +; C16-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] +; C16-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6 +; C16-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 +; C16-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] +; C16-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7 +; C16-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4 +; C16-NEXT: ret void +; +; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8( +; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C32-NEXT: [[ENTRY:.*:]] +; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8 +; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C32-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0 +; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C32-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C32-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C32-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C32-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 +; C32-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] +; C32-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4 +; C32-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 +; C32-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] +; C32-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5 +; C32-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 +; C32-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] +; C32-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6 +; C32-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 +; C32-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] +; C32-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7 +; C32-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4 +; C32-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx + %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4 + store <8 x i8> %vec, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @GEP_dynamic_idx_v16i8(ptr addrspace(1) %out, i32 %idx) { +; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8( +; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C4-NEXT: [[ENTRY:.*:]] +; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C4-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4 +; C4-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 +; C4-NEXT: ret void +; +; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8( +; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C8-NEXT: [[ENTRY:.*:]] +; C8-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C8-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C8-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4 +; C8-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 +; C8-NEXT: ret void +; +; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8( +; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C16-NEXT: [[ENTRY:.*:]] +; C16-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C16-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 16 +; C16-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C16-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP1]], i64 0 +; C16-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C16-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C16-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C16-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C16-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C16-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C16-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C16-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C16-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C16-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 +; C16-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] +; C16-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[TMP13]], i64 4 +; C16-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 +; C16-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] +; C16-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[TMP16]], i64 5 +; C16-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 +; C16-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] +; C16-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[TMP19]], i64 6 +; C16-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 +; C16-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] +; C16-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP22]], i64 7 +; C16-NEXT: [[TMP24:%.*]] = add i32 [[TMP0]], 8 +; C16-NEXT: [[TMP25:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP24]] +; C16-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP25]], i64 8 +; C16-NEXT: [[TMP27:%.*]] = add i32 [[TMP0]], 9 +; C16-NEXT: [[TMP28:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP27]] +; C16-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP28]], i64 9 +; C16-NEXT: [[TMP30:%.*]] = add i32 [[TMP0]], 10 +; C16-NEXT: [[TMP31:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP30]] +; C16-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP31]], i64 10 +; C16-NEXT: [[TMP33:%.*]] = add i32 [[TMP0]], 11 +; C16-NEXT: [[TMP34:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP33]] +; C16-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP34]], i64 11 +; C16-NEXT: [[TMP36:%.*]] = add i32 [[TMP0]], 12 +; C16-NEXT: [[TMP37:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP36]] +; C16-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP37]], i64 12 +; C16-NEXT: [[TMP39:%.*]] = add i32 [[TMP0]], 13 +; C16-NEXT: [[TMP40:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP39]] +; C16-NEXT: [[TMP41:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP40]], i64 13 +; C16-NEXT: [[TMP42:%.*]] = add i32 [[TMP0]], 14 +; C16-NEXT: [[TMP43:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP42]] +; C16-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP43]], i64 14 +; C16-NEXT: [[TMP45:%.*]] = add i32 [[TMP0]], 15 +; C16-NEXT: [[TMP46:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP45]] +; C16-NEXT: [[TMP47:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP46]], i64 15 +; C16-NEXT: store <16 x i8> [[TMP47]], ptr addrspace(1) [[OUT]], align 4 +; C16-NEXT: ret void +; +; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8( +; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C32-NEXT: [[ENTRY:.*:]] +; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 16 +; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C32-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP1]], i64 0 +; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C32-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C32-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C32-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C32-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 +; C32-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] +; C32-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[TMP13]], i64 4 +; C32-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 +; C32-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] +; C32-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[TMP16]], i64 5 +; C32-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 +; C32-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] +; C32-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[TMP19]], i64 6 +; C32-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 +; C32-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] +; C32-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP22]], i64 7 +; C32-NEXT: [[TMP24:%.*]] = add i32 [[TMP0]], 8 +; C32-NEXT: [[TMP25:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP24]] +; C32-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP25]], i64 8 +; C32-NEXT: [[TMP27:%.*]] = add i32 [[TMP0]], 9 +; C32-NEXT: [[TMP28:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP27]] +; C32-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP28]], i64 9 +; C32-NEXT: [[TMP30:%.*]] = add i32 [[TMP0]], 10 +; C32-NEXT: [[TMP31:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP30]] +; C32-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP31]], i64 10 +; C32-NEXT: [[TMP33:%.*]] = add i32 [[TMP0]], 11 +; C32-NEXT: [[TMP34:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP33]] +; C32-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP34]], i64 11 +; C32-NEXT: [[TMP36:%.*]] = add i32 [[TMP0]], 12 +; C32-NEXT: [[TMP37:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP36]] +; C32-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP37]], i64 12 +; C32-NEXT: [[TMP39:%.*]] = add i32 [[TMP0]], 13 +; C32-NEXT: [[TMP40:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP39]] +; C32-NEXT: [[TMP41:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP40]], i64 13 +; C32-NEXT: [[TMP42:%.*]] = add i32 [[TMP0]], 14 +; C32-NEXT: [[TMP43:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP42]] +; C32-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP43]], i64 14 +; C32-NEXT: [[TMP45:%.*]] = add i32 [[TMP0]], 15 +; C32-NEXT: [[TMP46:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP45]] +; C32-NEXT: [[TMP47:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP46]], i64 15 +; C32-NEXT: store <16 x i8> [[TMP47]], ptr addrspace(1) [[OUT]], align 4 +; C32-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx + %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4 + store <16 x i8> %vec, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) { +; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8( +; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C4-NEXT: [[ENTRY:.*:]] +; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <32 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C4-NEXT: [[VEC:%.*]] = load <32 x i8>, ptr addrspace(5) [[GEP]], align 4 +; C4-NEXT: store <32 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 +; C4-NEXT: ret void +; +; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8( +; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C8-NEXT: [[ENTRY:.*:]] +; C8-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C8-NEXT: [[GEP:%.*]] = getelementptr inbounds <32 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C8-NEXT: [[VEC:%.*]] = load <32 x i8>, ptr addrspace(5) [[GEP]], align 4 +; C8-NEXT: store <32 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 +; C8-NEXT: ret void +; +; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8( +; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C16-NEXT: [[ENTRY:.*:]] +; C16-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C16-NEXT: [[GEP:%.*]] = getelementptr inbounds <32 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C16-NEXT: [[VEC:%.*]] = load <32 x i8>, ptr addrspace(5) [[GEP]], align 4 +; C16-NEXT: store <32 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 +; C16-NEXT: ret void +; +; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8( +; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C32-NEXT: [[ENTRY:.*:]] +; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 32 +; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] +; C32-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> poison, i8 [[TMP1]], i64 0 +; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 +; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] +; C32-NEXT: [[TMP5:%.*]] = insertelement <32 x i8> [[TMP2]], i8 [[TMP4]], i64 1 +; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 +; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] +; C32-NEXT: [[TMP8:%.*]] = insertelement <32 x i8> [[TMP5]], i8 [[TMP7]], i64 2 +; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 +; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] +; C32-NEXT: [[TMP11:%.*]] = insertelement <32 x i8> [[TMP8]], i8 [[TMP10]], i64 3 +; C32-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 +; C32-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] +; C32-NEXT: [[TMP14:%.*]] = insertelement <32 x i8> [[TMP11]], i8 [[TMP13]], i64 4 +; C32-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 +; C32-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] +; C32-NEXT: [[TMP17:%.*]] = insertelement <32 x i8> [[TMP14]], i8 [[TMP16]], i64 5 +; C32-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 +; C32-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] +; C32-NEXT: [[TMP20:%.*]] = insertelement <32 x i8> [[TMP17]], i8 [[TMP19]], i64 6 +; C32-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 +; C32-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] +; C32-NEXT: [[TMP23:%.*]] = insertelement <32 x i8> [[TMP20]], i8 [[TMP22]], i64 7 +; C32-NEXT: [[TMP24:%.*]] = add i32 [[TMP0]], 8 +; C32-NEXT: [[TMP25:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP24]] +; C32-NEXT: [[TMP26:%.*]] = insertelement <32 x i8> [[TMP23]], i8 [[TMP25]], i64 8 +; C32-NEXT: [[TMP27:%.*]] = add i32 [[TMP0]], 9 +; C32-NEXT: [[TMP28:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP27]] +; C32-NEXT: [[TMP29:%.*]] = insertelement <32 x i8> [[TMP26]], i8 [[TMP28]], i64 9 +; C32-NEXT: [[TMP30:%.*]] = add i32 [[TMP0]], 10 +; C32-NEXT: [[TMP31:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP30]] +; C32-NEXT: [[TMP32:%.*]] = insertelement <32 x i8> [[TMP29]], i8 [[TMP31]], i64 10 +; C32-NEXT: [[TMP33:%.*]] = add i32 [[TMP0]], 11 +; C32-NEXT: [[TMP34:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP33]] +; C32-NEXT: [[TMP35:%.*]] = insertelement <32 x i8> [[TMP32]], i8 [[TMP34]], i64 11 +; C32-NEXT: [[TMP36:%.*]] = add i32 [[TMP0]], 12 +; C32-NEXT: [[TMP37:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP36]] +; C32-NEXT: [[TMP38:%.*]] = insertelement <32 x i8> [[TMP35]], i8 [[TMP37]], i64 12 +; C32-NEXT: [[TMP39:%.*]] = add i32 [[TMP0]], 13 +; C32-NEXT: [[TMP40:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP39]] +; C32-NEXT: [[TMP41:%.*]] = insertelement <32 x i8> [[TMP38]], i8 [[TMP40]], i64 13 +; C32-NEXT: [[TMP42:%.*]] = add i32 [[TMP0]], 14 +; C32-NEXT: [[TMP43:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP42]] +; C32-NEXT: [[TMP44:%.*]] = insertelement <32 x i8> [[TMP41]], i8 [[TMP43]], i64 14 +; C32-NEXT: [[TMP45:%.*]] = add i32 [[TMP0]], 15 +; C32-NEXT: [[TMP46:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP45]] +; C32-NEXT: [[TMP47:%.*]] = insertelement <32 x i8> [[TMP44]], i8 [[TMP46]], i64 15 +; C32-NEXT: [[TMP48:%.*]] = add i32 [[TMP0]], 16 +; C32-NEXT: [[TMP49:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP48]] +; C32-NEXT: [[TMP50:%.*]] = insertelement <32 x i8> [[TMP47]], i8 [[TMP49]], i64 16 +; C32-NEXT: [[TMP51:%.*]] = add i32 [[TMP0]], 17 +; C32-NEXT: [[TMP52:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP51]] +; C32-NEXT: [[TMP53:%.*]] = insertelement <32 x i8> [[TMP50]], i8 [[TMP52]], i64 17 +; C32-NEXT: [[TMP54:%.*]] = add i32 [[TMP0]], 18 +; C32-NEXT: [[TMP55:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP54]] +; C32-NEXT: [[TMP56:%.*]] = insertelement <32 x i8> [[TMP53]], i8 [[TMP55]], i64 18 +; C32-NEXT: [[TMP57:%.*]] = add i32 [[TMP0]], 19 +; C32-NEXT: [[TMP58:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP57]] +; C32-NEXT: [[TMP59:%.*]] = insertelement <32 x i8> [[TMP56]], i8 [[TMP58]], i64 19 +; C32-NEXT: [[TMP60:%.*]] = add i32 [[TMP0]], 20 +; C32-NEXT: [[TMP61:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP60]] +; C32-NEXT: [[TMP62:%.*]] = insertelement <32 x i8> [[TMP59]], i8 [[TMP61]], i64 20 +; C32-NEXT: [[TMP63:%.*]] = add i32 [[TMP0]], 21 +; C32-NEXT: [[TMP64:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP63]] +; C32-NEXT: [[TMP65:%.*]] = insertelement <32 x i8> [[TMP62]], i8 [[TMP64]], i64 21 +; C32-NEXT: [[TMP66:%.*]] = add i32 [[TMP0]], 22 +; C32-NEXT: [[TMP67:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP66]] +; C32-NEXT: [[TMP68:%.*]] = insertelement <32 x i8> [[TMP65]], i8 [[TMP67]], i64 22 +; C32-NEXT: [[TMP69:%.*]] = add i32 [[TMP0]], 23 +; C32-NEXT: [[TMP70:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP69]] +; C32-NEXT: [[TMP71:%.*]] = insertelement <32 x i8> [[TMP68]], i8 [[TMP70]], i64 23 +; C32-NEXT: [[TMP72:%.*]] = add i32 [[TMP0]], 24 +; C32-NEXT: [[TMP73:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP72]] +; C32-NEXT: [[TMP74:%.*]] = insertelement <32 x i8> [[TMP71]], i8 [[TMP73]], i64 24 +; C32-NEXT: [[TMP75:%.*]] = add i32 [[TMP0]], 25 +; C32-NEXT: [[TMP76:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP75]] +; C32-NEXT: [[TMP77:%.*]] = insertelement <32 x i8> [[TMP74]], i8 [[TMP76]], i64 25 +; C32-NEXT: [[TMP78:%.*]] = add i32 [[TMP0]], 26 +; C32-NEXT: [[TMP79:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP78]] +; C32-NEXT: [[TMP80:%.*]] = insertelement <32 x i8> [[TMP77]], i8 [[TMP79]], i64 26 +; C32-NEXT: [[TMP81:%.*]] = add i32 [[TMP0]], 27 +; C32-NEXT: [[TMP82:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP81]] +; C32-NEXT: [[TMP83:%.*]] = insertelement <32 x i8> [[TMP80]], i8 [[TMP82]], i64 27 +; C32-NEXT: [[TMP84:%.*]] = add i32 [[TMP0]], 28 +; C32-NEXT: [[TMP85:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP84]] +; C32-NEXT: [[TMP86:%.*]] = insertelement <32 x i8> [[TMP83]], i8 [[TMP85]], i64 28 +; C32-NEXT: [[TMP87:%.*]] = add i32 [[TMP0]], 29 +; C32-NEXT: [[TMP88:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP87]] +; C32-NEXT: [[TMP89:%.*]] = insertelement <32 x i8> [[TMP86]], i8 [[TMP88]], i64 29 +; C32-NEXT: [[TMP90:%.*]] = add i32 [[TMP0]], 30 +; C32-NEXT: [[TMP91:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP90]] +; C32-NEXT: [[TMP92:%.*]] = insertelement <32 x i8> [[TMP89]], i8 [[TMP91]], i64 30 +; C32-NEXT: [[TMP93:%.*]] = add i32 [[TMP0]], 31 +; C32-NEXT: [[TMP94:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP93]] +; C32-NEXT: [[TMP95:%.*]] = insertelement <32 x i8> [[TMP92]], i8 [[TMP94]], i64 31 +; C32-NEXT: store <32 x i8> [[TMP95]], ptr addrspace(1) [[OUT]], align 4 +; C32-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <32 x i8>, ptr addrspace(5) %alloca, i32 %idx + %vec = load <32 x i8>, ptr addrspace(5) %gep, align 4 + store <32 x i8> %vec, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) { +; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64 +; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; +; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload( +; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C4-NEXT: [[ENTRY:.*:]] +; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C4-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64 +; C4-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4 +; C4-NEXT: ret void +; +; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload( +; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C8-NEXT: [[ENTRY:.*:]] +; C8-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C8-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C8-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64 +; C8-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4 +; C8-NEXT: ret void +; +; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload( +; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C16-NEXT: [[ENTRY:.*:]] +; C16-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C16-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C16-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64 +; C16-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4 +; C16-NEXT: ret void +; +; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload( +; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; C32-NEXT: [[ENTRY:.*:]] +; C32-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) +; C32-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] +; C32-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64 +; C32-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4 +; C32-NEXT: ret void +; +entry: + %alloca = alloca [64 x i8], align 4, addrspace(5) + %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx + %gepint = ptrtoint ptr addrspace(5) %gep to i64 + store i64 %gepint, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll index caab29b58c13f..85a987f7c3a28 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll @@ -252,84 +252,6 @@ bb2: store i32 0, ptr addrspace(5) %extractelement ret void } - -define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) { -; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8( -; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] -; CHECK-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4 -; CHECK-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void -; -entry: - %alloca = alloca [64 x i8], align 4, addrspace(5) - %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx - %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4 - store <16 x i8> %vec, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) { -; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8( -; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison -; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7 -; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void -; -entry: - %alloca = alloca [64 x i8], align 4, addrspace(5) - %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx - %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4 - store <8 x i8> %vec, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) { -; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload( -; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]] -; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64 -; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: ret void -; -entry: - %alloca = alloca [64 x i8], align 4, addrspace(5) - %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx - %gepint = ptrtoint ptr addrspace(5) %gep to i64 - store i64 %gepint, ptr addrspace(1) %out, align 4 - ret void -} - - ;. ; CHECK: [[META0]] = !{} ; CHECK: [[RNG1]] = !{i32 0, i32 1025} From 8700e910df7dae1e71d6313c8c2ab1a3c9894a2f Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 14:07:10 -0600 Subject: [PATCH 09/12] format --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 3b656480770b6..b1bb1ae27e77a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -931,8 +931,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { GEP->users(), [&](const auto *U) { return isa(U); }); if (auto *UserVecTy = dyn_cast(GEP->getSourceElementType())) { - if (UsedInLoad && - UserVecTy->getNumElements() > PromoteAllocaDynamicIndexNumberElementLimit) { + if (UsedInLoad && UserVecTy->getNumElements() > + PromoteAllocaDynamicIndexNumberElementLimit) { return RejectUser(Inst, "user has too many elements for dynamic index"); } From 9d7b94bccac8c94388951e44df93f4fe48b5ab4a Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 14:17:14 -0600 Subject: [PATCH 10/12] remove comment --- llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll index 85a987f7c3a28..76e1868b3c4b9 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll @@ -3,8 +3,6 @@ ; Check that invalid IR is not produced on a vector typed ; getelementptr with a scalar alloca pointer base. -; Also check if GEP with dynamic index is rejected above -; threshold # of elements. define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() { ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() { From b7c57d46ac0e25896de833e9e36cad402b67d2f7 Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 23:21:33 -0600 Subject: [PATCH 11/12] Look at User LoadInst's type --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index b1bb1ae27e77a..c300d959f5b41 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -926,17 +926,21 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); - if (!isa(Index)) { - bool UsedInLoad = llvm::any_of( - GEP->users(), [&](const auto *U) { return isa(U); }); - if (auto *UserVecTy = - dyn_cast(GEP->getSourceElementType())) { - if (UsedInLoad && UserVecTy->getNumElements() > - PromoteAllocaDynamicIndexNumberElementLimit) { - return RejectUser(Inst, - "user has too many elements for dynamic index"); + auto Predicate = [&](const User *U) -> bool { + if (auto *LI = dyn_cast(U)) { + if (auto *LoadVecTy = dyn_cast(LI->getType())) { + if (LoadVecTy->getNumElements() > + PromoteAllocaDynamicIndexNumberElementLimit) + return true; } } + return false; + }; + if (!isa(Index)) { + if (llvm::any_of(GEP->users(), Predicate)) { + return RejectUser(Inst, + "user has too many elements for dynamic index"); + } } GEPVectorIdx[GEP] = Index; From c0fd36f4162f3ed16994f81e3230ee3b765bbf7a Mon Sep 17 00:00:00 2001 From: Kevin Choi Date: Wed, 3 Dec 2025 23:23:51 -0600 Subject: [PATCH 12/12] format --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index c300d959f5b41..7587b46433b96 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -930,7 +930,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *LI = dyn_cast(U)) { if (auto *LoadVecTy = dyn_cast(LI->getType())) { if (LoadVecTy->getNumElements() > - PromoteAllocaDynamicIndexNumberElementLimit) + PromoteAllocaDynamicIndexNumberElementLimit) return true; } }