-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Add support for store to constant address space #153835
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes: Since we don't treat stores to the constant address space as IR verifier errors, we need to support their lowering. This PR supports that by treating such stores as no-ops: in the combiner, the store node is simply replaced with its chain. Fixes SWDEV-499366. Full diff: https://github.com/llvm/llvm-project/pull/153835.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab7d753c..3f44559e07756 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3910,10 +3910,14 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ StoreSDNode *SN = cast<StoreSDNode>(N);
+ unsigned AS = SN->getAddressSpace();
+ if (AMDGPU::isConstantAddressSpace(AS))
+ return SN->getChain();
+
if (!DCI.isBeforeLegalize())
return SDValue();
- StoreSDNode *SN = cast<StoreSDNode>(N);
if (!SN->isSimple() || !ISD::isNormalStore(SN))
return SDValue();
@@ -3925,7 +3929,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
Align Alignment = SN->getAlign();
if (Alignment < Size && isTypeLegal(VT)) {
unsigned IsFast;
- unsigned AS = SN->getAddressSpace();
// Expand unaligned stores earlier than legalization. Due to visitation
// order problems during legalization, the emitted instructions to pack and
diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
deleted file mode 100644
index 0bfc45c84b0c4..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG %s
-; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL %s
-
-; SDAG: LLVM ERROR: Cannot select: {{[a-z0-9]+}}: ch = store<(store (s32) into %ir.ptr.load, addrspace 4)>
-; GISEL: LLVM ERROR: cannot select: G_STORE %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr(p4) :: (store (s32) into %ir.ptr.load, addrspace 4) (in function: store_to_constant_i32)
-define amdgpu_kernel void @store_to_constant_i32(ptr addrspace(4) %ptr) {
-bb:
- store i32 1, ptr addrspace(4) %ptr, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
new file mode 100644
index 0000000000000..77afbf8355f2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define amdgpu_kernel void @store_as4(ptr addrspace(4) %out, i32 %a, i32 %b) {
+; CHECK-LABEL: store_as4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_endpgm
+ %r = add i32 %a, %b
+ store i32 %r, ptr addrspace(4) %out
+ ret void
+}
+
+define amdgpu_kernel void @memset_as4(ptr addrspace(4) %dst) {
+; CHECK-LABEL: memset_as4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_endpgm
+ call void @llvm.memset.p4.i64(ptr addrspace(4) %dst, i8 0, i64 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memcpy_to_as4(ptr addrspace(4) %dst, ptr %src) {
+; CHECK-LABEL: memcpy_to_as4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_endpgm
+ call void @llvm.memcpy.p4.p0.i32(ptr addrspace(4) %dst, ptr %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @store_as6(ptr addrspace(6) %out, i32 %a, i32 %b) {
+; CHECK-LABEL: store_as6:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_endpgm
+ %r = add i32 %a, %b
+ store i32 %r, ptr addrspace(6) %out
+ ret void
+}
+
+define amdgpu_kernel void @memset_as6(ptr addrspace(6) %dst) {
+; CHECK-LABEL: memset_as6:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_endpgm
+ call void @llvm.memset.p6.i64(ptr addrspace(6) %dst, i8 0, i64 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memcpy_to_as6(ptr addrspace(6) %dst, ptr %src) {
+; CHECK-LABEL: memcpy_to_as6:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_endpgm
+ call void @llvm.memcpy.p6.p0.i32(ptr addrspace(6) %dst, ptr %src, i32 256, i1 false)
+ ret void
+}
+
+; define amdgpu_kernel void @cmpxchg_to_as4(ptr addrspace(4) %dst, i32 %src) {
+; %void = cmpxchg ptr addrspace(4) %dst, i32 0, i32 %src seq_cst monotonic
+; ret void
+; }
+
+; define amdgpu_kernel void @atomicrmw_to_as4(ptr addrspace(4) %dst, i32 %src) {
+; %void = atomicrmw add ptr addrspace(4) %dst, i32 %src acquire
+; ret void
+; }
+
+; define amdgpu_kernel void @cmpxchg_to_as6(ptr addrspace(6) %dst, i32 %src) {
+; %void = cmpxchg ptr addrspace(6) %dst, i32 0, i32 %src seq_cst monotonic
+; ret void
+; }
+
+; define amdgpu_kernel void @atomicrmw_to_as6(ptr addrspace(6) %dst, i32 %src) {
+; %void = atomicrmw add ptr addrspace(6) %dst, i32 %src acquire
+; ret void
+; }
+
+declare void @llvm.memset.p4.i64(ptr addrspace(4) noalias nocapture writeonly, i8, i64, i1)
+declare void @llvm.memset.p6.i64(ptr addrspace(6) noalias nocapture writeonly, i8, i64, i1)
+declare void @llvm.memcpy.p4.p0.i32(ptr addrspace(4) noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1)
+declare void @llvm.memcpy.p6.p0.i32(ptr addrspace(6) noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1)
|
|
This is marked as WIP at this moment because I have not done the GISel part. I'm not sure if the combiner is the right way to do this, but I don't think we can and should do this in the pattern matching as well. On the other hand, if this approach is fine, I'll also do the atomic. |
|
Is this a good idea? I think we should just report_fatal_error. |
|
It might not be a good idea to report this as a backend fatal error, IMHO. If we think it should be treated as an error, it needs to be checked in the IR verifier. If not, then we should support it. @arsenm mentioned that a constant AS isn't fundamentally different from global AS, so we should support it, though the only difference is that the semantics make it effectively do nothing. Interestingly, @arsenm preferred to make it an IR verifier error in 788f94f. |
No, this should not be a fatal error. This is just undefined behavior that happens to be statically known. Treating it as invalid IR places a large burden on any pass that does some form of value propagation that makes an originally UB source obvious
That's not an IR verifier error |
|
I think Clang should refuse it. It is like a store through a pointer to const — a store to constant memory.
Clang doesn't know it's a constant store. You can always cast the pointer and access it somewhere else |
|
Is the approach in this PR the right direction then? |
deb7416 to
82947b1
Compare
| AddrSpaces.Constant, | ||
| AddrSpaces.Constant32Bit ]>; | ||
| def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>; | ||
| def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can just get rid of the distinction between the load and store address spaces and use one list
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically we can but it will need a refinement of where it is used. We could do that as a follow-up if we want.
| case AMDGPUAS::GLOBAL_ADDRESS: | ||
| case AMDGPUAS::FLAT_ADDRESS: | ||
| case AMDGPUAS::CONSTANT_ADDRESS: | ||
| case AMDGPUAS::CONSTANT_ADDRESS_32BIT: | ||
| return MemVT.getSizeInBits() <= 4 * 32; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it really worth doing this optimization? I'd leave this for later anyway, we should also cover the unknown address spaces
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is required for supporting memset (or maybe even memcpy, as well as the memmov you suggested) case; otherwise the combiner would generate a store of s1024. If we don't want to touch this part, we also need to remove the test case. I'm fine either way. WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The default return should not be true (i.e. #90714)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I removed this part, as well as the test cases. I put a FIXME there and we can revisit this after that is fixed.
Since we don't treat stores to the constant address space as IR verifier errors, we need to support their lowering. This PR supports that by treating such stores as no-ops: in the combiner, the store node is simply replaced with its chain. Fixes SWDEV-499366.
82947b1 to
f5d7761
Compare
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. In a follow-up, we should drop the separate store address-space list, and make sure this works for atomics and memory intrinsics.
| ; CHECK-NEXT: s_endpgm | ||
| store <2 x double> %v, ptr addrspace(4) %p | ||
| ret void | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should also make sure atomicrmw, cmpxchg, and memory intrinsics work

Since we don't treat stores to the constant address space as IR verifier errors, we need to support their lowering.
Fixes SWDEV-499366.