Skip to content

Commit 0a29f91

Browse files
committed
[AMDGPU][GISel] Add RegBankLegalize support for G_STRICT_{FADD|FSUB|FMUL}
1 parent 5c15f57 commit 0a29f91

File tree

11 files changed: 201 additions and 91 deletions.

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,10 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
120120
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
121121
case UniV2S16:
122122
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
123+
case UniV2S32:
124+
return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
125+
case UniV2S64:
126+
return MRI.getType(Reg) == LLT::fixed_vector(2, 64) && MUI.isUniform(Reg);
123127
case UniB32:
124128
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
125129
case UniB64:
@@ -160,6 +164,10 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
160164
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
161165
case DivV2S16:
162166
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
167+
case DivV2S32:
168+
return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
169+
case DivV2S64:
170+
return MRI.getType(Reg) == LLT::fixed_vector(2, 64) && MUI.isDivergent(Reg);
163171
case DivB32:
164172
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
165173
case DivB64:
@@ -991,6 +999,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
991999
.Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
9921000
.Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
9931001

1002+
addRulesForGOpcs({G_STRICT_FADD, G_STRICT_FSUB, G_STRICT_FMUL}, Standard)
1003+
.Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
1004+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1005+
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1006+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1007+
.Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1008+
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1009+
.Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
1010+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1011+
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
1012+
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}})
1013+
.Any({{UniV2S64}, {{UniInVgprV2S64}, {VgprV2S64, VgprV2S64}}})
1014+
.Any({{DivV2S64}, {{VgprV2S64}, {VgprV2S64, VgprV2S64}}});
1015+
9941016
using namespace Intrinsic;
9951017

9961018
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,10 @@ enum UniformityLLTOpPredicateID {
9595

9696
UniV2S16,
9797
UniV2S32,
98-
98+
UniV2S64,
9999
DivV2S16,
100100
DivV2S32,
101+
DivV2S64,
101102

102103
// B types
103104
B32,
@@ -172,6 +173,7 @@ enum RegBankLLTMappingApplyID {
172173
VgprPtr128,
173174
VgprV2S16,
174175
VgprV2S32,
176+
VgprV2S64,
175177
VgprB32,
176178
VgprB64,
177179
VgprB96,
@@ -187,6 +189,7 @@ enum RegBankLLTMappingApplyID {
187189
UniInVgprS64,
188190
UniInVgprV2S16,
189191
UniInVgprV2S32,
192+
UniInVgprV2S64,
190193
UniInVgprV4S32,
191194
UniInVgprB32,
192195
UniInVgprB64,

llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
33
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
4+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,SDAG %s
5+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -global-isel -new-reg-bank-select -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GISEL %s
56
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
67
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
78
; FIXME: promotion not handled without f16 insts
@@ -179,12 +180,19 @@ define <3 x half> @v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
179180
; GFX8-NEXT: v_add_f16_e32 v1, v1, v3
180181
; GFX8-NEXT: s_setpc_b64 s[30:31]
181182
;
182-
; GFX10-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
183-
; GFX10: ; %bb.0:
184-
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185-
; GFX10-NEXT: v_pk_add_f16 v0, v0, v2
186-
; GFX10-NEXT: v_add_f16_e32 v1, v1, v3
187-
; GFX10-NEXT: s_setpc_b64 s[30:31]
183+
; SDAG-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
184+
; SDAG: ; %bb.0:
185+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186+
; SDAG-NEXT: v_pk_add_f16 v0, v0, v2
187+
; SDAG-NEXT: v_add_f16_e32 v1, v1, v3
188+
; SDAG-NEXT: s_setpc_b64 s[30:31]
189+
;
190+
; GISEL-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
191+
; GISEL: ; %bb.0:
192+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193+
; GISEL-NEXT: v_pk_add_f16 v0, v0, v2
194+
; GISEL-NEXT: v_pk_add_f16 v1, v1, v3
195+
; GISEL-NEXT: s_setpc_b64 s[30:31]
188196
;
189197
; GFX11-TRUE16-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
190198
; GFX11-TRUE16: ; %bb.0:
@@ -228,16 +236,23 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
228236
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
229237
; GFX8-NEXT: s_setpc_b64 s[30:31]
230238
;
231-
; GFX10-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
232-
; GFX10: ; %bb.0:
233-
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234-
; GFX10-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
235-
; GFX10-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
236-
; GFX10-NEXT: v_add_f16_e32 v0, v0, v2
237-
; GFX10-NEXT: v_add_f16_e32 v1, v1, v3
238-
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
239-
; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
240-
; GFX10-NEXT: s_setpc_b64 s[30:31]
239+
; SDAG-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
240+
; SDAG: ; %bb.0:
241+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242+
; SDAG-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
243+
; SDAG-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
244+
; SDAG-NEXT: v_add_f16_e32 v0, v0, v2
245+
; SDAG-NEXT: v_add_f16_e32 v1, v1, v3
246+
; SDAG-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
247+
; SDAG-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
248+
; SDAG-NEXT: s_setpc_b64 s[30:31]
249+
;
250+
; GISEL-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
251+
; GISEL: ; %bb.0:
252+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253+
; GISEL-NEXT: v_pk_add_f16 v0, v0, v2
254+
; GISEL-NEXT: v_pk_add_f16 v1, v1, v3
255+
; GISEL-NEXT: s_setpc_b64 s[30:31]
241256
;
242257
; GFX11-TRUE16-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
243258
; GFX11-TRUE16: ; %bb.0:

llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
3-
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
3+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,SDAG %s
4+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -global-isel -new-reg-bank-select -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GISEL %s
45
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
56

67
define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 {
@@ -206,11 +207,23 @@ define float @v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs(float %x, floa
206207
; GCN-NEXT: v_sub_f32_e64 v0, v1, |v0|
207208
; GCN-NEXT: s_setpc_b64 s[30:31]
208209
;
209-
; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs:
210-
; GFX10PLUS: ; %bb.0:
211-
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212-
; GFX10PLUS-NEXT: v_sub_f32_e64 v0, v1, |v0|
213-
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
210+
; SDAG-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs:
211+
; SDAG: ; %bb.0:
212+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213+
; SDAG-NEXT: v_sub_f32_e64 v0, v1, |v0|
214+
; SDAG-NEXT: s_setpc_b64 s[30:31]
215+
;
216+
; GISEL-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs:
217+
; GISEL: ; %bb.0:
218+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219+
; GISEL-NEXT: v_add_f32_e64 v0, -|v0|, v1
220+
; GISEL-NEXT: s_setpc_b64 s[30:31]
221+
;
222+
; GFX11-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs:
223+
; GFX11: ; %bb.0:
224+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225+
; GFX11-NEXT: v_sub_f32_e64 v0, v1, |v0|
226+
; GFX11-NEXT: s_setpc_b64 s[30:31]
214227
%fabs.x = call float @llvm.fabs.f32(float %x) #0
215228
%neg.fabs.x = fneg float %fabs.x
216229
%val = call float @llvm.experimental.constrained.fadd.f32(float %neg.fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")

llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll

Lines changed: 68 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
3-
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
4-
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
3+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,SDAG %s
4+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -global-isel -new-reg-bank-select -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GISEL %s
5+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
56

67
define double @v_constained_fadd_f64_fpexcept_strict(double %x, double %y) #0 {
78
; GCN-LABEL: v_constained_fadd_f64_fpexcept_strict:
@@ -15,6 +16,12 @@ define double @v_constained_fadd_f64_fpexcept_strict(double %x, double %y) #0 {
1516
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1617
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
1718
; GFX10-NEXT: s_setpc_b64 s[30:31]
19+
;
20+
; GFX11-LABEL: v_constained_fadd_f64_fpexcept_strict:
21+
; GFX11: ; %bb.0:
22+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
24+
; GFX11-NEXT: s_setpc_b64 s[30:31]
1825
%val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
1926
ret double %val
2027
}
@@ -31,6 +38,12 @@ define double @v_constained_fadd_f64_fpexcept_ignore(double %x, double %y) #0 {
3138
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3239
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
3340
; GFX10-NEXT: s_setpc_b64 s[30:31]
41+
;
42+
; GFX11-LABEL: v_constained_fadd_f64_fpexcept_ignore:
43+
; GFX11: ; %bb.0:
44+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
46+
; GFX11-NEXT: s_setpc_b64 s[30:31]
3447
%val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
3548
ret double %val
3649
}
@@ -47,6 +60,12 @@ define double @v_constained_fadd_f64_fpexcept_maytrap(double %x, double %y) #0 {
4760
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4861
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
4962
; GFX10-NEXT: s_setpc_b64 s[30:31]
63+
;
64+
; GFX11-LABEL: v_constained_fadd_f64_fpexcept_maytrap:
65+
; GFX11: ; %bb.0:
66+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
68+
; GFX11-NEXT: s_setpc_b64 s[30:31]
5069
%val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
5170
ret double %val
5271
}
@@ -65,6 +84,13 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_strict(<2 x double> %x, <2
6584
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
6685
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
6786
; GFX10-NEXT: s_setpc_b64 s[30:31]
87+
;
88+
; GFX11-LABEL: v_constained_fadd_v2f64_fpexcept_strict:
89+
; GFX11: ; %bb.0:
90+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
92+
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
93+
; GFX11-NEXT: s_setpc_b64 s[30:31]
6894
%val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
6995
ret <2 x double> %val
7096
}
@@ -83,6 +109,13 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_ignore(<2 x double> %x, <2
83109
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
84110
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
85111
; GFX10-NEXT: s_setpc_b64 s[30:31]
112+
;
113+
; GFX11-LABEL: v_constained_fadd_v2f64_fpexcept_ignore:
114+
; GFX11: ; %bb.0:
115+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
117+
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
118+
; GFX11-NEXT: s_setpc_b64 s[30:31]
86119
%val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
87120
ret <2 x double> %val
88121
}
@@ -101,6 +134,13 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_maytrap(<2 x double> %x, <
101134
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
102135
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
103136
; GFX10-NEXT: s_setpc_b64 s[30:31]
137+
;
138+
; GFX11-LABEL: v_constained_fadd_v2f64_fpexcept_maytrap:
139+
; GFX11: ; %bb.0:
140+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
142+
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
143+
; GFX11-NEXT: s_setpc_b64 s[30:31]
104144
%val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
105145
ret <2 x double> %val
106146
}
@@ -121,6 +161,14 @@ define <3 x double> @v_constained_fadd_v3f64_fpexcept_strict(<3 x double> %x, <3
121161
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[8:9]
122162
; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11]
123163
; GFX10-NEXT: s_setpc_b64 s[30:31]
164+
;
165+
; GFX11-LABEL: v_constained_fadd_v3f64_fpexcept_strict:
166+
; GFX11: ; %bb.0:
167+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[6:7]
169+
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], v[8:9]
170+
; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11]
171+
; GFX11-NEXT: s_setpc_b64 s[30:31]
124172
%val = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(<3 x double> %x, <3 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
125173
ret <3 x double> %val
126174
}
@@ -133,10 +181,24 @@ define amdgpu_ps <2 x float> @s_constained_fadd_f64_fpexcept_strict(double inreg
133181
; GCN-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
134182
; GCN-NEXT: ; return to shader part epilog
135183
;
136-
; GFX10-LABEL: s_constained_fadd_f64_fpexcept_strict:
137-
; GFX10: ; %bb.0:
138-
; GFX10-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5]
139-
; GFX10-NEXT: ; return to shader part epilog
184+
; SDAG-LABEL: s_constained_fadd_f64_fpexcept_strict:
185+
; SDAG: ; %bb.0:
186+
; SDAG-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5]
187+
; SDAG-NEXT: ; return to shader part epilog
188+
;
189+
; GISEL-LABEL: s_constained_fadd_f64_fpexcept_strict:
190+
; GISEL: ; %bb.0:
191+
; GISEL-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5]
192+
; GISEL-NEXT: v_readfirstlane_b32 s0, v0
193+
; GISEL-NEXT: v_readfirstlane_b32 s1, v1
194+
; GISEL-NEXT: v_mov_b32_e32 v0, s0
195+
; GISEL-NEXT: v_mov_b32_e32 v1, s1
196+
; GISEL-NEXT: ; return to shader part epilog
197+
;
198+
; GFX11-LABEL: s_constained_fadd_f64_fpexcept_strict:
199+
; GFX11: ; %bb.0:
200+
; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5]
201+
; GFX11-NEXT: ; return to shader part epilog
140202
%val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
141203
%cast = bitcast double %val to <2 x float>
142204
ret <2 x float> %cast

llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
3-
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
3+
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
44

55
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
6-
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
6+
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
77

88
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-SDAG %s
9-
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-GISEL %s
9+
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-GISEL %s
1010

1111
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
1212
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
13-
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL,GFX1-GISEL-TRUE16 %s
14-
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL,GFX1-GISEL-FAKE16 %s
13+
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL,GFX1-GISEL-TRUE16 %s
14+
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL,GFX1-GISEL-FAKE16 %s
1515

1616

1717
; FIXME: promotion not handled without f16 insts
@@ -454,14 +454,19 @@ define amdgpu_ps <2 x half> @s_constained_fmul_v2f16_fpexcept_strict(<2 x half>
454454
;
455455
; GFX8-GISEL-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
456456
; GFX8-GISEL: ; %bb.0:
457-
; GFX8-GISEL-NEXT: s_lshr_b32 s0, s2, 16
458-
; GFX8-GISEL-NEXT: s_lshr_b32 s1, s3, 16
459457
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s3
460-
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
461-
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
458+
; GFX8-GISEL-NEXT: s_lshr_b32 s1, s3, 16
462459
; GFX8-GISEL-NEXT: v_mul_f16_e32 v0, s2, v0
463-
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
464-
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
460+
; GFX8-GISEL-NEXT: s_lshr_b32 s0, s2, 16
461+
; GFX8-GISEL-NEXT: v_readfirstlane_b32 s2, v0
462+
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s1
463+
; GFX8-GISEL-NEXT: v_mul_f16_e32 v0, s0, v0
464+
; GFX8-GISEL-NEXT: v_readfirstlane_b32 s0, v0
465+
; GFX8-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
466+
; GFX8-GISEL-NEXT: s_and_b32 s1, 0xffff, s2
467+
; GFX8-GISEL-NEXT: s_lshl_b32 s0, s0, 16
468+
; GFX8-GISEL-NEXT: s_or_b32 s0, s1, s0
469+
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
465470
; GFX8-GISEL-NEXT: ; return to shader part epilog
466471
;
467472
; GFX10PLUS-LABEL: s_constained_fmul_v2f16_fpexcept_strict:

0 commit comments

Comments
 (0)