Skip to content

Commit 4027426

Browse files
committed
[AMDGPU][GlobalISel] Add register bank legalization for G_AMDGPU_BUFFER_LOAD_BYTE_AND_SHORT
1 parent b6bcfde commit 4027426

File tree

6 files changed: 208 additions and 8 deletions.

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
890890
.Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
891891
.Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
892892

893+
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
894+
G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
895+
StandardB)
896+
.Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
897+
.Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
898+
893899
addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
894900
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
895901

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -new-reg-bank-select -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
3+
4+
define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
5+
; GFX12-LABEL: raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext:
6+
; GFX12: ; %bb.0:
7+
; GFX12-NEXT: buffer_load_u8 v0, v0, s[0:3], s4 offen
8+
; GFX12-NEXT: s_wait_loadcnt 0x0
9+
; GFX12-NEXT: ; return to shader part epilog
10+
%val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
11+
%zext = zext i8 %val to i32
12+
%cast = bitcast i32 %zext to float
13+
ret float %cast
14+
}
15+
16+
define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
17+
; GFX12-LABEL: raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext:
18+
; GFX12: ; %bb.0:
19+
; GFX12-NEXT: buffer_load_i8 v0, v0, s[0:3], s4 offen
20+
; GFX12-NEXT: s_wait_loadcnt 0x0
21+
; GFX12-NEXT: ; return to shader part epilog
22+
%val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
23+
%sext = sext i8 %val to i32
24+
%cast = bitcast i32 %sext to float
25+
ret float %cast
26+
}
27+
28+
define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
29+
; GFX12-LABEL: raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext:
30+
; GFX12: ; %bb.0:
31+
; GFX12-NEXT: buffer_load_u16 v0, v0, s[0:3], s4 offen
32+
; GFX12-NEXT: s_wait_loadcnt 0x0
33+
; GFX12-NEXT: ; return to shader part epilog
34+
%val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
35+
%zext = zext i16 %val to i32
36+
%cast = bitcast i32 %zext to float
37+
ret float %cast
38+
}
39+
40+
define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
41+
; GFX12-LABEL: raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext:
42+
; GFX12: ; %bb.0:
43+
; GFX12-NEXT: buffer_load_i16 v0, v0, s[0:3], s4 offen
44+
; GFX12-NEXT: s_wait_loadcnt 0x0
45+
; GFX12-NEXT: ; return to shader part epilog
46+
%val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
47+
%sext = sext i16 %val to i32
48+
%cast = bitcast i32 %sext to float
49+
ret float %cast
50+
}
51+
52+
define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
53+
; GFX12-LABEL: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset_zext:
54+
; GFX12: ; %bb.0:
55+
; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
56+
; GFX12-NEXT: s_mov_b32 s2, exec_lo
57+
; GFX12-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
58+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
59+
; GFX12-NEXT: v_readfirstlane_b32 s4, v5
60+
; GFX12-NEXT: s_wait_loadcnt 0x0
61+
; GFX12-NEXT: v_readfirstlane_b32 s5, v6
62+
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
63+
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
64+
; GFX12-NEXT: s_wait_alu 0xf1ff
65+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
66+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
67+
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
68+
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
69+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
70+
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
71+
; GFX12-NEXT: buffer_load_u8 v6, v4, s[4:7], s0 offen
72+
; GFX12-NEXT: ; implicit-def: $vgpr5
73+
; GFX12-NEXT: ; implicit-def: $vgpr4
74+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
75+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
76+
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
77+
; GFX12-NEXT: ; %bb.2:
78+
; GFX12-NEXT: s_mov_b32 exec_lo, s2
79+
; GFX12-NEXT: s_wait_loadcnt 0x0
80+
; GFX12-NEXT: v_mov_b32_e32 v0, v6
81+
; GFX12-NEXT: ; return to shader part epilog
82+
%val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
83+
%zext = zext i8 %val to i32
84+
%cast = bitcast i32 %zext to float
85+
ret float %cast
86+
}
87+
88+
define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
89+
; GFX12-LABEL: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset_sext:
90+
; GFX12: ; %bb.0:
91+
; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
92+
; GFX12-NEXT: s_mov_b32 s2, exec_lo
93+
; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
94+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
95+
; GFX12-NEXT: v_readfirstlane_b32 s4, v5
96+
; GFX12-NEXT: s_wait_loadcnt 0x0
97+
; GFX12-NEXT: v_readfirstlane_b32 s5, v6
98+
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
99+
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
100+
; GFX12-NEXT: s_wait_alu 0xf1ff
101+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
102+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
103+
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
104+
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
105+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
106+
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
107+
; GFX12-NEXT: buffer_load_i8 v6, v4, s[4:7], s0 offen
108+
; GFX12-NEXT: ; implicit-def: $vgpr5
109+
; GFX12-NEXT: ; implicit-def: $vgpr4
110+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
111+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
112+
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
113+
; GFX12-NEXT: ; %bb.2:
114+
; GFX12-NEXT: s_mov_b32 exec_lo, s2
115+
; GFX12-NEXT: s_wait_loadcnt 0x0
116+
; GFX12-NEXT: v_mov_b32_e32 v0, v6
117+
; GFX12-NEXT: ; return to shader part epilog
118+
%val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
119+
%sext = sext i8 %val to i32
120+
%cast = bitcast i32 %sext to float
121+
ret float %cast
122+
}
123+
124+
define amdgpu_ps float @raw_buffer_load_i16__vgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
125+
; GFX12-LABEL: raw_buffer_load_i16__vgpr_rsrc__vgpr_voffset__sgpr_soffset_zext:
126+
; GFX12: ; %bb.0:
127+
; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
128+
; GFX12-NEXT: s_mov_b32 s2, exec_lo
129+
; GFX12-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
130+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
131+
; GFX12-NEXT: v_readfirstlane_b32 s4, v5
132+
; GFX12-NEXT: s_wait_loadcnt 0x0
133+
; GFX12-NEXT: v_readfirstlane_b32 s5, v6
134+
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
135+
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
136+
; GFX12-NEXT: s_wait_alu 0xf1ff
137+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
138+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
139+
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
140+
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
141+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
142+
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
143+
; GFX12-NEXT: buffer_load_u16 v6, v4, s[4:7], s0 offen
144+
; GFX12-NEXT: ; implicit-def: $vgpr5
145+
; GFX12-NEXT: ; implicit-def: $vgpr4
146+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
147+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
148+
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
149+
; GFX12-NEXT: ; %bb.2:
150+
; GFX12-NEXT: s_mov_b32 exec_lo, s2
151+
; GFX12-NEXT: s_wait_loadcnt 0x0
152+
; GFX12-NEXT: v_mov_b32_e32 v0, v6
153+
; GFX12-NEXT: ; return to shader part epilog
154+
%val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
155+
%zext = zext i16 %val to i32
156+
%cast = bitcast i32 %zext to float
157+
ret float %cast
158+
}
159+
160+
define amdgpu_ps float @raw_buffer_load_i16__vgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
161+
; GFX12-LABEL: raw_buffer_load_i16__vgpr_rsrc__vgpr_voffset__sgpr_soffset_sext:
162+
; GFX12: ; %bb.0:
163+
; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
164+
; GFX12-NEXT: s_mov_b32 s2, exec_lo
165+
; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
166+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
167+
; GFX12-NEXT: v_readfirstlane_b32 s4, v5
168+
; GFX12-NEXT: s_wait_loadcnt 0x0
169+
; GFX12-NEXT: v_readfirstlane_b32 s5, v6
170+
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
171+
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
172+
; GFX12-NEXT: s_wait_alu 0xf1ff
173+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
174+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
175+
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[2:3]
176+
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
177+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
178+
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
179+
; GFX12-NEXT: buffer_load_i16 v6, v4, s[4:7], s0 offen
180+
; GFX12-NEXT: ; implicit-def: $vgpr5
181+
; GFX12-NEXT: ; implicit-def: $vgpr4
182+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
183+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
184+
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
185+
; GFX12-NEXT: ; %bb.2:
186+
; GFX12-NEXT: s_mov_b32 exec_lo, s2
187+
; GFX12-NEXT: s_wait_loadcnt 0x0
188+
; GFX12-NEXT: v_mov_b32_e32 v0, v6
189+
; GFX12-NEXT: ; return to shader part epilog
190+
%val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
191+
%sext = sext i16 %val to i32
192+
%cast = bitcast i32 %sext to float
193+
ret float %cast
194+
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
55
; FIXME: Test with SI when argument lowering not broken for f16
66

77
; Natural mapping

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
33
; FIXME: Test with SI when argument lowering not broken for f16
44

55
; Natural mapping

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
55

66
; Natural mapping
77
define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
33

44
; Natural mapping
55
define amdgpu_ps float @struct_ptr_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {

0 commit comments

Comments
 (0)