Skip to content

Commit 82947b1

Browse files
committed
[AMDGPU] Add support for store to constant address space
Since stores to the constant address space are no longer treated as IR verifier errors, we need to support their lowering. This PR supports that by treating such stores as no-ops: in the combiner, the store node is simply replaced with its chain. Fixes SWDEV-499366.
1 parent a4cff34 commit 82947b1

File tree

4 files changed

+139
-18
lines changed

4 files changed

+139
-18
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,9 @@ def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant,
450450
def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global,
451451
AddrSpaces.Constant,
452452
AddrSpaces.Constant32Bit ]>;
453-
def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
453+
def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global,
454+
AddrSpaces.Constant,
455+
AddrSpaces.Constant32Bit ]>;
454456

455457
def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
456458
AddrSpaces.Global,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1873,15 +1873,23 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
18731873

18741874
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
18751875
const MachineFunction &MF) const {
1876-
if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1877-
return (MemVT.getSizeInBits() <= 4 * 32);
1878-
if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1876+
switch (AS) {
1877+
default:
1878+
return true;
1879+
case AMDGPUAS::GLOBAL_ADDRESS:
1880+
case AMDGPUAS::FLAT_ADDRESS:
1881+
case AMDGPUAS::CONSTANT_ADDRESS:
1882+
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
1883+
return MemVT.getSizeInBits() <= 4 * 32;
1884+
case AMDGPUAS::PRIVATE_ADDRESS: {
18791885
unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1880-
return (MemVT.getSizeInBits() <= MaxPrivateBits);
1886+
return MemVT.getSizeInBits() <= MaxPrivateBits;
18811887
}
1882-
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1883-
return (MemVT.getSizeInBits() <= 2 * 32);
1884-
return true;
1888+
case AMDGPUAS::LOCAL_ADDRESS:
1889+
case AMDGPUAS::REGION_ADDRESS:
1890+
return MemVT.getSizeInBits() <= 2 * 32;
1891+
}
1892+
llvm_unreachable("this should not be reached");
18851893
}
18861894

18871895
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(

llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll

Lines changed: 0 additions & 10 deletions
This file was deleted.
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
3+
4+
define amdgpu_kernel void @store_as4(ptr addrspace(4) %out, i32 %a, i32 %b) {
5+
; CHECK-LABEL: store_as4:
6+
; CHECK: ; %bb.0:
7+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
8+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
9+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
10+
; CHECK-NEXT: s_add_i32 s2, s2, s3
11+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
12+
; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
13+
; CHECK-NEXT: s_endpgm
14+
%r = add i32 %a, %b
15+
store i32 %r, ptr addrspace(4) %out
16+
ret void
17+
}
18+
19+
define amdgpu_kernel void @memset_as4(ptr addrspace(4) %dst) {
20+
; CHECK-LABEL: memset_as4:
21+
; CHECK: ; %bb.0:
22+
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
23+
; CHECK-NEXT: s_mov_b32 s0, 0
24+
; CHECK-NEXT: s_mov_b32 s1, s0
25+
; CHECK-NEXT: s_mov_b32 s2, s0
26+
; CHECK-NEXT: s_mov_b32 s3, s0
27+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
28+
; CHECK-NEXT: v_mov_b32_e32 v4, 0
29+
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
30+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
31+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:240
32+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:224
33+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:208
34+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:192
35+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:176
36+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:160
37+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:144
38+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:128
39+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:112
40+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:96
41+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:80
42+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:64
43+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:48
44+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:32
45+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:16
46+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
47+
; CHECK-NEXT: s_endpgm
48+
call void @llvm.memset.p4.i64(ptr addrspace(4) %dst, i8 0, i64 256, i1 false)
49+
ret void
50+
}
51+
52+
define amdgpu_kernel void @memcpy_to_as4(ptr addrspace(4) %dst, ptr %src) {
53+
; CHECK-LABEL: memcpy_to_as4:
54+
; CHECK: ; %bb.0:
55+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
56+
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
57+
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
58+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
59+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
60+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
61+
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:112
62+
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:96
63+
; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:80
64+
; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:64
65+
; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:48
66+
; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:32
67+
; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:16
68+
; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[0:1]
69+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
70+
; CHECK-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] offset:112
71+
; CHECK-NEXT: global_store_dwordx4 v2, v[8:11], s[0:1] offset:96
72+
; CHECK-NEXT: global_store_dwordx4 v2, v[12:15], s[0:1] offset:80
73+
; CHECK-NEXT: global_store_dwordx4 v2, v[16:19], s[0:1] offset:64
74+
; CHECK-NEXT: global_store_dwordx4 v2, v[20:23], s[0:1] offset:48
75+
; CHECK-NEXT: global_store_dwordx4 v2, v[24:27], s[0:1] offset:32
76+
; CHECK-NEXT: global_store_dwordx4 v2, v[28:31], s[0:1] offset:16
77+
; CHECK-NEXT: global_store_dwordx4 v2, v[32:35], s[0:1]
78+
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:240
79+
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:224
80+
; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:208
81+
; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:192
82+
; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:176
83+
; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:160
84+
; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:144
85+
; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:128
86+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
87+
; CHECK-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] offset:240
88+
; CHECK-NEXT: global_store_dwordx4 v2, v[8:11], s[0:1] offset:224
89+
; CHECK-NEXT: global_store_dwordx4 v2, v[12:15], s[0:1] offset:208
90+
; CHECK-NEXT: global_store_dwordx4 v2, v[16:19], s[0:1] offset:192
91+
; CHECK-NEXT: global_store_dwordx4 v2, v[20:23], s[0:1] offset:176
92+
; CHECK-NEXT: global_store_dwordx4 v2, v[24:27], s[0:1] offset:160
93+
; CHECK-NEXT: global_store_dwordx4 v2, v[28:31], s[0:1] offset:144
94+
; CHECK-NEXT: global_store_dwordx4 v2, v[32:35], s[0:1] offset:128
95+
; CHECK-NEXT: s_endpgm
96+
call void @llvm.memcpy.p4.p0.i32(ptr addrspace(4) %dst, ptr %src, i32 256, i1 false)
97+
ret void
98+
}
99+
100+
; FIXME: All the following AS6 test cases fail because of illegal VGPR to SGPR copy.
101+
102+
; define amdgpu_kernel void @store_as6(ptr addrspace(6) %out, i32 %a, i32 %b) {
103+
; %r = add i32 %a, %b
104+
; store i32 %r, ptr addrspace(6) %out
105+
; ret void
106+
; }
107+
108+
; define amdgpu_kernel void @memset_as6(ptr addrspace(6) %dst) {
109+
; call void @llvm.memset.p6.i64(ptr addrspace(6) %dst, i8 0, i64 256, i1 false)
110+
; ret void
111+
; }
112+
113+
; define amdgpu_kernel void @memcpy_to_as6(ptr addrspace(6) %dst, ptr %src) {
114+
; call void @llvm.memcpy.p6.p0.i32(ptr addrspace(6) %dst, ptr %src, i32 256, i1 false)
115+
; ret void
116+
; }
117+
118+
declare void @llvm.memset.p4.i64(ptr addrspace(4) noalias nocapture writeonly, i8, i64, i1)
119+
declare void @llvm.memset.p6.i64(ptr addrspace(6) noalias nocapture writeonly, i8, i64, i1)
120+
declare void @llvm.memcpy.p4.p0.i32(ptr addrspace(4) noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1)
121+
declare void @llvm.memcpy.p6.p0.i32(ptr addrspace(6) noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1)

0 commit comments

Comments
 (0)