Skip to content

Commit f5d7761

Browse files
committed
[AMDGPU] Add support for store to constant address space
Since we don't treat stores to the constant address space as IR verifier errors, we need to support their lowering. This PR supports that by treating such stores as no-ops: in the combiner, the store node is simply replaced with its chain. Fixes SWDEV-499366.
1 parent e04feda commit f5d7761

File tree

3 files changed

+189
-11
lines changed

3 files changed

+189
-11
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,9 @@ def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant,
450450
def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global,
451451
AddrSpaces.Constant,
452452
AddrSpaces.Constant32Bit ]>;
453-
def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
453+
// Address spaces listed for global stores. Both constant address spaces are
// listed alongside Global; the accompanying store-to-constant test expects
// addrspace(4) stores to be emitted as global_store_* instructions.
def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global,
454+
AddrSpaces.Constant,
455+
AddrSpaces.Constant32Bit ]>;
454456

455457
def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
456458
AddrSpaces.Global,

llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll

Lines changed: 0 additions & 10 deletions
This file was deleted.
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
3+
4+
; FIXME: We need to test AS6, but the AS6 variants of the following tests fail because of an illegal VGPR to SGPR copy.
5+
; FIXME: We also want to test memset, memcpy, and memmove, but that requires fixing the SelectionDAG store merging issue (#90714) first.
6+
7+
; Stores an i8 kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with global_store_byte.
define amdgpu_kernel void @store_as4_i8(ptr addrspace(4) %p, i8 %v) {
8+
; CHECK-LABEL: store_as4_i8:
9+
; CHECK: ; %bb.0:
10+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
11+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
12+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
13+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
14+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
15+
; CHECK-NEXT: global_store_byte v0, v1, s[0:1]
16+
; CHECK-NEXT: s_endpgm
17+
store i8 %v, ptr addrspace(4) %p
18+
ret void
19+
}
20+
21+
; Stores an i16 kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with global_store_short.
define amdgpu_kernel void @store_as4_i16(ptr addrspace(4) %p, i16 %v) {
22+
; CHECK-LABEL: store_as4_i16:
23+
; CHECK: ; %bb.0:
24+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
25+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
26+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
27+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
28+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
29+
; CHECK-NEXT: global_store_short v0, v1, s[0:1]
30+
; CHECK-NEXT: s_endpgm
31+
store i16 %v, ptr addrspace(4) %p
32+
ret void
33+
}
34+
35+
; Stores an i32 kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with global_store_dword.
define amdgpu_kernel void @store_as4_i32(ptr addrspace(4) %p, i32 %v) {
36+
; CHECK-LABEL: store_as4_i32:
37+
; CHECK: ; %bb.0:
38+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
39+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
40+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
41+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
42+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
43+
; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
44+
; CHECK-NEXT: s_endpgm
45+
store i32 %v, ptr addrspace(4) %p
46+
ret void
47+
}
48+
49+
; Stores an i64 kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with global_store_dwordx2.
define amdgpu_kernel void @store_as4_i64(ptr addrspace(4) %p, i64 %v) {
50+
; CHECK-LABEL: store_as4_i64:
51+
; CHECK: ; %bb.0:
52+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
53+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
54+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
55+
; CHECK-NEXT: v_mov_b32_e32 v2, s2
56+
; CHECK-NEXT: v_mov_b32_e32 v3, s3
57+
; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
58+
; CHECK-NEXT: s_endpgm
59+
store i64 %v, ptr addrspace(4) %p
60+
ret void
61+
}
62+
63+
; Stores a float kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with global_store_dword.
define amdgpu_kernel void @store_as4_float(ptr addrspace(4) %p, float %v) {
64+
; CHECK-LABEL: store_as4_float:
65+
; CHECK: ; %bb.0:
66+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
67+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
68+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
69+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
70+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
71+
; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
72+
; CHECK-NEXT: s_endpgm
73+
store float %v, ptr addrspace(4) %p
74+
ret void
75+
}
76+
77+
; Stores a double kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with global_store_dwordx2.
define amdgpu_kernel void @store_as4_double(ptr addrspace(4) %p, double %v) {
78+
; CHECK-LABEL: store_as4_double:
79+
; CHECK: ; %bb.0:
80+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
81+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
82+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
83+
; CHECK-NEXT: v_mov_b32_e32 v2, s2
84+
; CHECK-NEXT: v_mov_b32_e32 v3, s3
85+
; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
86+
; CHECK-NEXT: s_endpgm
87+
store double %v, ptr addrspace(4) %p
88+
ret void
89+
}
90+
91+
; Stores a half kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with global_store_short.
define amdgpu_kernel void @store_as4_half(ptr addrspace(4) %p, half %v) {
92+
; CHECK-LABEL: store_as4_half:
93+
; CHECK: ; %bb.0:
94+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
95+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
96+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
97+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
98+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
99+
; CHECK-NEXT: global_store_short v0, v1, s[0:1]
100+
; CHECK-NEXT: s_endpgm
101+
store half %v, ptr addrspace(4) %p
102+
ret void
103+
}
104+
105+
; Stores a <2 x i8> kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with a single global_store_short.
define amdgpu_kernel void @store_as4_2xi8(ptr addrspace(4) %p, <2 x i8> %v) {
106+
; CHECK-LABEL: store_as4_2xi8:
107+
; CHECK: ; %bb.0:
108+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
109+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
110+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
111+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
112+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
113+
; CHECK-NEXT: global_store_short v0, v1, s[0:1]
114+
; CHECK-NEXT: s_endpgm
115+
store <2 x i8> %v, ptr addrspace(4) %p
116+
ret void
117+
}
118+
119+
; Stores a <2 x i16> kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with a single global_store_dword.
define amdgpu_kernel void @store_as4_2xi16(ptr addrspace(4) %p, <2 x i16> %v) {
120+
; CHECK-LABEL: store_as4_2xi16:
121+
; CHECK: ; %bb.0:
122+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
123+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
124+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
125+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
126+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
127+
; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
128+
; CHECK-NEXT: s_endpgm
129+
store <2 x i16> %v, ptr addrspace(4) %p
130+
ret void
131+
}
132+
133+
; Stores a <2 x i32> kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with a single global_store_dwordx2.
define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) {
134+
; CHECK-LABEL: store_as4_2xi32:
135+
; CHECK: ; %bb.0:
136+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
137+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
138+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
139+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
140+
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
141+
; CHECK-NEXT: s_endpgm
142+
store <2 x i32> %v, ptr addrspace(4) %p
143+
ret void
144+
}
145+
146+
; Stores a <2 x half> kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with a single global_store_dword.
define amdgpu_kernel void @store_as4_2xhalf(ptr addrspace(4) %p, <2 x half> %v) {
147+
; CHECK-LABEL: store_as4_2xhalf:
148+
; CHECK: ; %bb.0:
149+
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
150+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
151+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
152+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
153+
; CHECK-NEXT: v_mov_b32_e32 v1, s2
154+
; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
155+
; CHECK-NEXT: s_endpgm
156+
store <2 x half> %v, ptr addrspace(4) %p
157+
ret void
158+
}
159+
160+
; Stores a <2 x float> kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with a single global_store_dwordx2.
define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v) {
161+
; CHECK-LABEL: store_as4_2xfloat:
162+
; CHECK: ; %bb.0:
163+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
164+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
165+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
166+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
167+
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
168+
; CHECK-NEXT: s_endpgm
169+
store <2 x float> %v, ptr addrspace(4) %p
170+
ret void
171+
}
172+
173+
; Stores a <2 x double> kernel argument through a ptr addrspace(4) argument; the
; expected gfx90a output performs the store with a single global_store_dwordx4.
define amdgpu_kernel void @store_as4_2xdouble(ptr addrspace(4) %p, <2 x double> %v) {
174+
; CHECK-LABEL: store_as4_2xdouble:
175+
; CHECK: ; %bb.0:
176+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
177+
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
178+
; CHECK-NEXT: v_mov_b32_e32 v4, 0
179+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
180+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
181+
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
182+
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
183+
; CHECK-NEXT: s_endpgm
184+
store <2 x double> %v, ptr addrspace(4) %p
185+
ret void
186+
}

0 commit comments

Comments
 (0)