-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Promote nestedGEP allocas to vectors #141199
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2e58d7e
30bb8c8
6364294
9d1307f
c2925c8
60f3bac
e6b3e83
4d89174
3420393
3b7b814
160723a
4c95ac1
311290a
3735fa3
2ece81d
7ad2ee2
f2efffb
8e400b9
872663b
a23e944
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -437,9 +437,34 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, | |
| unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); | ||
| SmallMapVector<Value *, APInt, 4> VarOffsets; | ||
| APInt ConstOffset(BW, 0); | ||
| if (GEP->getPointerOperand()->stripPointerCasts() != Alloca || | ||
| !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) | ||
| return nullptr; | ||
|
|
||
| // Walk backwards through nested GEPs to collect both constant and variable | ||
| // offsets, so that nested vector GEP chains can be lowered in one step. | ||
| // | ||
| // Given this IR fragment as input: | ||
| // | ||
| // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5) | ||
| // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j | ||
| // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4 | ||
| // %3 = load i32, ptr addrspace(5) %2, align 4 | ||
| // | ||
| // Combine both GEP operations in a single pass, producing: | ||
| // BasePtr = %0 | ||
| // ConstOffset = 4 | ||
| // VarOffsets = { %j -> element_size(<2 x i32>) } | ||
| // | ||
| // That lets us emit a single buffer_load directly into a VGPR, without ever | ||
| // allocating scratch memory for the intermediate pointer. | ||
| Value *CurPtr = GEP; | ||
| while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) { | ||
| if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) | ||
| return nullptr; | ||
|
|
||
| // Move to the next outer pointer. | ||
| CurPtr = CurGEP->getPointerOperand(); | ||
| } | ||
|
|
||
| assert(CurPtr == Alloca && "GEP not based on alloca"); | ||
|
||
|
|
||
| unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy); | ||
| if (VarOffsets.size() > 1) | ||
harrisonGPU marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
| ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s | ||
| define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(i32 %idx, ptr addrspace(1) %output) #0 { | ||
| ; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep( | ||
| ; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] { | ||
| ; CHECK-NEXT: [[ENTRY:.*:]] | ||
| ; CHECK-NEXT: [[BUF:%.*]] = freeze <20 x i32> poison | ||
| ; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]] | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <20 x i32> [[TMP1]], i32 2, i32 [[TMP2]] | ||
| ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[IDX]], 2 | ||
| ; CHECK-NEXT: [[TMP5:%.*]] = add i32 1, [[TMP4]] | ||
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP5]] | ||
| ; CHECK-NEXT: store i32 [[TMP6]], ptr addrspace(1) [[OUTPUT]], align 4 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| entry: | ||
| %alloca = alloca [10 x <2 x i32>], align 8, addrspace(5) | ||
| %row = getelementptr [10 x <2 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %idx | ||
| store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8 | ||
| %elt1 = getelementptr i8, ptr addrspace(5) %row, i32 4 | ||
| %val = load i32, ptr addrspace(5) %elt1, align 4 | ||
| store i32 %val, ptr addrspace(1) %output | ||
| ret void | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to make this not dead code. Also use named values in tests
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have updated it.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is still dead code
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean not use alloca after load? right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understood your mind! Thanks !
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have updated it, what do you think about it? :-)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "dead code" in this case refers to the fact that this alloca has no stores, no valid data.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I have updated it, and I inserted valid data to alloca, what do you think about it? |
||
| } | ||
|
|
||
| define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(i32 %idx, ptr addrspace(1) %output) #0 { | ||
| ; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3( | ||
| ; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] { | ||
| ; CHECK-NEXT: [[ENTRY:.*:]] | ||
| ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <16 x i32> poison | ||
| ; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = add i32 8, [[TMP0]] | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[ALLOCA]], i32 10, i32 [[TMP1]] | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], 1 | ||
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP2]], i32 20, i32 [[TMP3]] | ||
| ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[IDX]], 2 | ||
| ; CHECK-NEXT: [[TMP6:%.*]] = add i32 9, [[TMP5]] | ||
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP4]], i32 [[TMP6]] | ||
| ; CHECK-NEXT: store i32 [[TMP7]], ptr addrspace(1) [[OUTPUT]], align 4 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| entry: | ||
| %alloca = alloca [2 x [4 x <2 x i32>]], align 8, addrspace(5) | ||
| %lvl1 = getelementptr inbounds [2 x [4 x <2 x i32>]], ptr addrspace(5) %alloca, i32 0, i32 1 | ||
| %lvl2 = getelementptr inbounds [4 x <2 x i32>], ptr addrspace(5) %lvl1, i32 0, i32 %idx | ||
| store <2 x i32> <i32 10, i32 20>, ptr addrspace(5) %lvl2, align 8 | ||
| %byte = getelementptr inbounds i8, ptr addrspace(5) %lvl2, i32 4 | ||
| %val = load i32, ptr addrspace(5) %byte, align 4 | ||
| store i32 %val, ptr addrspace(1) %output | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx(i32 %idx, ptr addrspace(1) %out) #0 { | ||
| ; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx( | ||
| ; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { | ||
| ; CHECK-NEXT: [[ENTRY:.*:]] | ||
| ; CHECK-NEXT: [[BUF:%.*]] = freeze <20 x i32> poison | ||
| ; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2 | ||
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]] | ||
| ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 1 | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <20 x i32> [[TMP4]], i32 2, i32 [[TMP5]] | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[IDX]], 3 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP1]] | ||
| ; CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[OUT]], align 4 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| entry: | ||
| %alloca = alloca [10 x [2 x i32]], align 8, addrspace(5) | ||
| %row = getelementptr inbounds [10 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 %idx | ||
| store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8 | ||
| %elt = getelementptr inbounds [2 x i32], ptr addrspace(5) %row, i32 0, i32 %idx | ||
| %val = load i32, ptr addrspace(5) %elt, align 4 | ||
| store i32 %val, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="32" } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure this is safe to assert?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am sure it is safe ! Because it is from the following code:
llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Lines 364 to 370 in c3656af
So it is must alloca type.