-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Enable i8 GEP promotion for vector allocas #166132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
a69372f
6a28740
19584ca
2573066
8c3f2e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -456,10 +456,25 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, | |||||||||||||||
| const auto &VarOffset = VarOffsets.front(); | ||||||||||||||||
| APInt OffsetQuot; | ||||||||||||||||
| APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); | ||||||||||||||||
| if (Rem != 0 || OffsetQuot.isZero()) | ||||||||||||||||
| return nullptr; | ||||||||||||||||
|
|
||||||||||||||||
| Value *Offset = VarOffset.first; | ||||||||||||||||
| if (Rem != 0 || OffsetQuot.isZero()) { | ||||||||||||||||
| unsigned ElemSizeShift = Log2_64(VecElemSize); | ||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to validate that VecElemSize is a power of 2?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, but I don't think it is necessary to explicitly check whether the element size is a power of two, because that is already covered by the existing check here: llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp Lines 871 to 877 in 52fdcf9
If the element type is not naturally aligned, that check returns false, which also rejects non-power-of-2 element sizes such as i24. |
||||||||||||||||
| SimplifyQuery SQ(DL); | ||||||||||||||||
| SQ.CxtI = GEP; | ||||||||||||||||
| KnownBits KB = computeKnownBits(VarOffset.first, SQ); | ||||||||||||||||
| // Bail out if the index may point into the middle of an element. | ||||||||||||||||
| if (KB.countMinTrailingZeros() < ElemSizeShift) | ||||||||||||||||
| return nullptr; | ||||||||||||||||
|
|
||||||||||||||||
| Value *Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); | ||||||||||||||||
| if (Instruction *NewInst = dyn_cast<Instruction>(Scaled)) | ||||||||||||||||
| NewInsts.push_back(NewInst); | ||||||||||||||||
|
|
||||||||||||||||
| Offset = Scaled; | ||||||||||||||||
| OffsetQuot = APInt(BW, 1); | ||||||||||||||||
| Rem = 0; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); | ||||||||||||||||
| if (!OffsetType) | ||||||||||||||||
| return nullptr; | ||||||||||||||||
|
|
||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -250,6 +250,88 @@ bb2: | |
| store i32 0, ptr addrspace(5) %extractelement | ||
| ret void | ||
| } | ||
|
|
||
| ; Test: a float is stored through an i8 (byte-offset) GEP into a <3 x float> | ||
| ; alloca, where the byte index is a select between 0 and 4. Both values are | ||
| ; multiples of 4, the size in bytes of a float element. | ||
| ; The CHECK lines expect the alloca to be replaced by vector operations: the | ||
| ; byte index is shifted right by 2 (lshr) and used as the insertelement index. | ||
| define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(ptr %buffer, float %data, i1 %idx_sel) { | ||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] | ||
| ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca <3 x float>, align 16, addrspace(5) | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %alloca | ||
| %index = select i1 %idx_sel, i32 0, i32 4 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index | ||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
|
||
| ; Test: same shape as the 0-or-4 case, but the byte index is a select between | ||
| ; 4 and 8, so neither possible offset is zero. Both are still multiples of 4, | ||
| ; the size in bytes of a float element. | ||
| ; The CHECK lines expect the alloca to be replaced by vector operations, with | ||
| ; the byte index shifted right by 2 (lshr) to form the insertelement index. | ||
| define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) { | ||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] | ||
| ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca <3 x float>, align 16, addrspace(5) | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %alloca | ||
| %index = select i1 %idx_sel, i32 4, i32 8 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index | ||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
|
||
| ; Negative test: the byte index is a select between 4 and 5. Offset 5 is not a | ||
| ; multiple of the 4-byte float element size, so it would point into the middle | ||
| ; of an element. | ||
| ; The CHECK lines expect no conversion to insertelement: the i8 GEP and the | ||
| ; float store are kept. The alloca itself is replaced by a slot in an | ||
| ; addrspace(3) [1024 x <3 x float>] array indexed by a value computed from the | ||
| ; workitem ids (NOTE(review): presumably the pass's LDS fallback - confirm). | ||
| define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) { | ||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1 | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]] | ||
| ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2 | ||
| ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]] | ||
| ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16 | ||
| ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() | ||
| ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() | ||
| ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() | ||
| ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]] | ||
| ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]] | ||
| ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]] | ||
| ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]] | ||
| ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]] | ||
| ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]] | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5 | ||
| ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]] | ||
| ; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4 | ||
| ; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16 | ||
| ; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca <3 x float>, align 16, addrspace(5) | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %alloca | ||
| %index = select i1 %idx_sel, i32 4, i32 5 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index | ||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add a test with a non-power-of-2 element size. |
||
| ;. | ||
| ; CHECK: [[META0]] = !{} | ||
| ; CHECK: [[RNG1]] = !{i32 0, i32 1025} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it have to be this complicated? I thought checking whether
`offset % size` would be sufficient?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I agree with your points; I have removed it.