Skip to content

Commit d162afa

Browse files
authored
[AMDGPU][GlobalISel] Add RegBankLegalize support for G_FPEXT (llvm#171483)
1 parent 8adcf0a commit d162afa

File tree

3 files changed

+276
-2
lines changed

3 files changed

+276
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -976,6 +976,13 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
976976
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
977977
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
978978

979+
addRulesForGOpcs({G_FPEXT})
980+
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
981+
.Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
982+
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
983+
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
984+
.Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
985+
979986
addRulesForGOpcs({G_IS_FPCLASS})
980987
.Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
981988
.Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
Lines changed: 263 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,263 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
5+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
6+
7+
define amdgpu_ps float @fpext_f16_to_f32_uniform(half inreg %a) {
8+
; GFX11-LABEL: fpext_f16_to_f32_uniform:
9+
; GFX11: ; %bb.0:
10+
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
11+
; GFX11-NEXT: ; return to shader part epilog
12+
;
13+
; GFX12-LABEL: fpext_f16_to_f32_uniform:
14+
; GFX12: ; %bb.0:
15+
; GFX12-NEXT: s_cvt_f32_f16 s0, s0
16+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
17+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
18+
; GFX12-NEXT: ; return to shader part epilog
19+
%result = fpext half %a to float
20+
ret float %result
21+
}
22+
23+
define amdgpu_ps float @fpext_f16_to_f32_div(half %a) {
24+
; GFX11-FAKE16-LABEL: fpext_f16_to_f32_div:
25+
; GFX11-FAKE16: ; %bb.0:
26+
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
27+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
28+
;
29+
; GFX11-TRUE16-LABEL: fpext_f16_to_f32_div:
30+
; GFX11-TRUE16: ; %bb.0:
31+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
32+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
33+
;
34+
; GFX12-FAKE16-LABEL: fpext_f16_to_f32_div:
35+
; GFX12-FAKE16: ; %bb.0:
36+
; GFX12-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
37+
; GFX12-FAKE16-NEXT: ; return to shader part epilog
38+
;
39+
; GFX12-TRUE16-LABEL: fpext_f16_to_f32_div:
40+
; GFX12-TRUE16: ; %bb.0:
41+
; GFX12-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
42+
; GFX12-TRUE16-NEXT: ; return to shader part epilog
43+
%result = fpext half %a to float
44+
ret float %result
45+
}
46+
47+
define amdgpu_ps void @fpext_f32_to_f64_uniform(float inreg %a, ptr addrspace(1) %ptr) {
48+
; GFX11-LABEL: fpext_f32_to_f64_uniform:
49+
; GFX11: ; %bb.0:
50+
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], s0
51+
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
52+
; GFX11-NEXT: s_endpgm
53+
;
54+
; GFX12-LABEL: fpext_f32_to_f64_uniform:
55+
; GFX12: ; %bb.0:
56+
; GFX12-NEXT: v_cvt_f64_f32_e32 v[2:3], s0
57+
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
58+
; GFX12-NEXT: s_endpgm
59+
%result = fpext float %a to double
60+
store double %result, ptr addrspace(1) %ptr
61+
ret void
62+
}
63+
64+
define amdgpu_ps void @fpext_f32_to_f64_div(float %a, ptr addrspace(1) %ptr) {
65+
; GFX11-LABEL: fpext_f32_to_f64_div:
66+
; GFX11: ; %bb.0:
67+
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
68+
; GFX11-NEXT: global_store_b64 v[1:2], v[3:4], off
69+
; GFX11-NEXT: s_endpgm
70+
;
71+
; GFX12-LABEL: fpext_f32_to_f64_div:
72+
; GFX12: ; %bb.0:
73+
; GFX12-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
74+
; GFX12-NEXT: global_store_b64 v[1:2], v[3:4], off
75+
; GFX12-NEXT: s_endpgm
76+
%result = fpext float %a to double
77+
store double %result, ptr addrspace(1) %ptr
78+
ret void
79+
}
80+
81+
define amdgpu_ps void @fpext_f16_to_f64_uniform(half inreg %a, ptr addrspace(1) %ptr) {
82+
; GFX11-LABEL: fpext_f16_to_f64_uniform:
83+
; GFX11: ; %bb.0:
84+
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0
85+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
86+
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
87+
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
88+
; GFX11-NEXT: s_endpgm
89+
;
90+
; GFX12-LABEL: fpext_f16_to_f64_uniform:
91+
; GFX12: ; %bb.0:
92+
; GFX12-NEXT: s_cvt_f32_f16 s0, s0
93+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
94+
; GFX12-NEXT: v_cvt_f64_f32_e32 v[2:3], s0
95+
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
96+
; GFX12-NEXT: s_endpgm
97+
%result = fpext half %a to double
98+
store double %result, ptr addrspace(1) %ptr
99+
ret void
100+
}
101+
102+
define amdgpu_ps void @fpext_f16_to_f64_div(half %a, ptr addrspace(1) %ptr) {
103+
; GFX11-FAKE16-LABEL: fpext_f16_to_f64_div:
104+
; GFX11-FAKE16: ; %bb.0:
105+
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
106+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
107+
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
108+
; GFX11-FAKE16-NEXT: global_store_b64 v[1:2], v[3:4], off
109+
; GFX11-FAKE16-NEXT: s_endpgm
110+
;
111+
; GFX11-TRUE16-LABEL: fpext_f16_to_f64_div:
112+
; GFX11-TRUE16: ; %bb.0:
113+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
114+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
115+
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
116+
; GFX11-TRUE16-NEXT: global_store_b64 v[1:2], v[3:4], off
117+
; GFX11-TRUE16-NEXT: s_endpgm
118+
;
119+
; GFX12-FAKE16-LABEL: fpext_f16_to_f64_div:
120+
; GFX12-FAKE16: ; %bb.0:
121+
; GFX12-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
122+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
123+
; GFX12-FAKE16-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
124+
; GFX12-FAKE16-NEXT: global_store_b64 v[1:2], v[3:4], off
125+
; GFX12-FAKE16-NEXT: s_endpgm
126+
;
127+
; GFX12-TRUE16-LABEL: fpext_f16_to_f64_div:
128+
; GFX12-TRUE16: ; %bb.0:
129+
; GFX12-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
130+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
131+
; GFX12-TRUE16-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
132+
; GFX12-TRUE16-NEXT: global_store_b64 v[1:2], v[3:4], off
133+
; GFX12-TRUE16-NEXT: s_endpgm
134+
%result = fpext half %a to double
135+
store double %result, ptr addrspace(1) %ptr
136+
ret void
137+
}
138+
139+
define amdgpu_ps <2 x float> @fpext_v2f16_to_v2f32_uniform(<2 x half> inreg %a) {
140+
; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32_uniform:
141+
; GFX11-FAKE16: ; %bb.0:
142+
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 16
143+
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s0
144+
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s1
145+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
146+
;
147+
; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32_uniform:
148+
; GFX11-TRUE16: ; %bb.0:
149+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s0
150+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
151+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
152+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
153+
;
154+
; GFX12-FAKE16-LABEL: fpext_v2f16_to_v2f32_uniform:
155+
; GFX12-FAKE16: ; %bb.0:
156+
; GFX12-FAKE16-NEXT: s_cvt_f32_f16 s1, s0
157+
; GFX12-FAKE16-NEXT: s_cvt_hi_f32_f16 s0, s0
158+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
159+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
160+
; GFX12-FAKE16-NEXT: ; return to shader part epilog
161+
;
162+
; GFX12-TRUE16-LABEL: fpext_v2f16_to_v2f32_uniform:
163+
; GFX12-TRUE16: ; %bb.0:
164+
; GFX12-TRUE16-NEXT: s_cvt_f32_f16 s0, s0
165+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
166+
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0
167+
; GFX12-TRUE16-NEXT: ; return to shader part epilog
168+
%result = fpext <2 x half> %a to <2 x float>
169+
ret <2 x float> %result
170+
}
171+
172+
define amdgpu_ps <2 x float> @fpext_v2f16_to_v2f32_div(<2 x half> %a) {
173+
; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32_div:
174+
; GFX11-FAKE16: ; %bb.0:
175+
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
176+
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
177+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
178+
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
179+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
180+
;
181+
; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32_div:
182+
; GFX11-TRUE16: ; %bb.0:
183+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v0.l
184+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.h
185+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
186+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
187+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
188+
;
189+
; GFX12-FAKE16-LABEL: fpext_v2f16_to_v2f32_div:
190+
; GFX12-FAKE16: ; %bb.0:
191+
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
192+
; GFX12-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
193+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
194+
; GFX12-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
195+
; GFX12-FAKE16-NEXT: ; return to shader part epilog
196+
;
197+
; GFX12-TRUE16-LABEL: fpext_v2f16_to_v2f32_div:
198+
; GFX12-TRUE16: ; %bb.0:
199+
; GFX12-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v0.l
200+
; GFX12-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.h
201+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
202+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
203+
; GFX12-TRUE16-NEXT: ; return to shader part epilog
204+
%result = fpext <2 x half> %a to <2 x float>
205+
ret <2 x float> %result
206+
}
207+
208+
define amdgpu_ps void @fpext_v2f32_to_v2f64_uniform(<2 x float> inreg %a, ptr addrspace(1) %ptr) {
209+
; GFX11-LABEL: fpext_v2f32_to_v2f64_uniform:
210+
; GFX11: ; %bb.0:
211+
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], s1
212+
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], s0
213+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
214+
; GFX11-NEXT: v_readfirstlane_b32 s3, v5
215+
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
216+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
217+
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
218+
; GFX11-NEXT: v_readfirstlane_b32 s2, v4
219+
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0
220+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
221+
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v4, s2
222+
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
223+
; GFX11-NEXT: s_endpgm
224+
;
225+
; GFX12-LABEL: fpext_v2f32_to_v2f64_uniform:
226+
; GFX12: ; %bb.0:
227+
; GFX12-NEXT: v_cvt_f64_f32_e32 v[4:5], s1
228+
; GFX12-NEXT: v_cvt_f64_f32_e32 v[2:3], s0
229+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
230+
; GFX12-NEXT: v_readfirstlane_b32 s3, v5
231+
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
232+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
233+
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
234+
; GFX12-NEXT: v_readfirstlane_b32 s2, v4
235+
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
236+
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0
237+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
238+
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v4, s2
239+
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
240+
; GFX12-NEXT: s_endpgm
241+
%result = fpext <2 x float> %a to <2 x double>
242+
store <2 x double> %result, ptr addrspace(1) %ptr
243+
ret void
244+
}
245+
246+
define amdgpu_ps void @fpext_v2f32_to_v2f64_div(<2 x float> %a, ptr addrspace(1) %ptr) {
247+
; GFX11-LABEL: fpext_v2f32_to_v2f64_div:
248+
; GFX11: ; %bb.0:
249+
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
250+
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
251+
; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off
252+
; GFX11-NEXT: s_endpgm
253+
;
254+
; GFX12-LABEL: fpext_v2f32_to_v2f64_div:
255+
; GFX12: ; %bb.0:
256+
; GFX12-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
257+
; GFX12-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
258+
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
259+
; GFX12-NEXT: s_endpgm
260+
%result = fpext <2 x float> %a to <2 x double>
261+
store <2 x double> %result, ptr addrspace(1) %ptr
262+
ret void
263+
}

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fpext.mir

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
3-
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
2+
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
44

55
---
66
name: fpext_s
@@ -15,6 +15,10 @@ body: |
1515
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
1616
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
1717
; CHECK-NEXT: [[FPEXT:%[0-9]+]]:vgpr(s64) = G_FPEXT [[COPY1]](s32)
18+
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[FPEXT]](s64)
19+
; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
20+
; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
21+
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
1822
%0:_(s32) = COPY $sgpr0
1923
%1:_(s64) = G_FPEXT %0
2024
...

0 commit comments

Comments
 (0)