Skip to content

Commit dacc23b

Browse files
author
Hugh Delaney
committed
Add patterns for fma.relu.{f16|bf16}
Add patterns to lower `fma(a, b, c) > 0 ? fma(a, b, c) : 0` to a single `fma.rn.relu` instruction for the f16 and bf16 types.
1 parent 05b6c2e commit dacc23b

File tree

2 files changed

+377
-0
lines changed

2 files changed

+377
-0
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3917,3 +3917,24 @@ def atomic_thread_fence_seq_cst_cta :
39173917
def atomic_thread_fence_acq_rel_cta :
39183918
NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
39193919
Requires<[hasPTX<60>, hasSM<70>]>;
3920+
3921+
// Floating-point immediate leaf over any FP type (fAny). It accepts only
// constants for which Imm.isExactlyValue(+0.0) holds, i.e. positive zero.
// NOTE(review): -0.0 presumably does not satisfy isExactlyValue(+0.0) - confirm.
def fpimm0 : FPImmLeaf<fAny, [{
3922+
return Imm.isExactlyValue(+0.0);
3923+
}]>;
3924+
3925+
// fma.rn.relu.f16: takes three Int16Regs operands ($a, $b, $c) and writes one
// Int16Regs result ($dst). The instruction has no selection pattern of its own
// (the pattern list is []). It is available only when useFP16Math, hasPTX<70>
// and hasSM<80> all hold.
def FMARELU_F16 :
3926+
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
3927+
"fma.rn.relu.f16 \t$dst, $a, $b, $c;", []>,
3928+
Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
3929+
// fma.rn.relu.bf16: takes three Int16Regs operands ($a, $b, $c) and writes one
// Int16Regs result ($dst). The instruction has no selection pattern of its own
// (the pattern list is []). It is available only when hasBF16Math, hasPTX<70>
// and hasSM<80> all hold.
def FMARELU_BF16 :
3930+
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
3931+
"fma.rn.relu.bf16 \t$dst, $a, $b, $c;", []>,
3932+
Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
3933+
3934+
// Select fmaxnum(fma(a, b, c), +0.0) on f16 as a single FMARELU_F16.
// A Pat carries its own predicate list and does not inherit the one attached
// to the instruction it emits, so the hardware gates of FMARELU_F16
// (useFP16Math, hasPTX<70>, hasSM<80>) are repeated here alongside the
// FP-mode gates (allowFMA, allowUnsafeFPMath).
def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
3935+
(FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
3936+
Requires<[useFP16Math, hasPTX<70>, hasSM<80>, allowFMA, allowUnsafeFPMath]>;
3937+
3938+
// Select fmaxnum(fma(a, b, c), +0.0) on bf16 as a single FMARELU_BF16.
// A Pat carries its own predicate list and does not inherit the one attached
// to the instruction it emits, so the hardware gates of FMARELU_BF16
// (hasBF16Math, hasPTX<70>, hasSM<80>) are repeated here alongside the
// FP-mode gates (allowFMA, allowUnsafeFPMath).
def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
3939+
(FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
3940+
Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, allowFMA, allowUnsafeFPMath]>;
Lines changed: 356 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,356 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s
3+
; The ptxas invocation below passes the same llc flags as the first invocation
; (including --enable-unsafe-fp-math), so the fma.rn.relu.* output is what
; actually gets assembled.
; RUN: %if ptxas %{ llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | %ptxas-verify -arch=sm_80 %}
4+
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=0 | FileCheck %s --check-prefixes=CHECK-NO-FMA
5+
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_70 -mattr=ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-NO-ARCH
6+
; NOTE(review): the invocation below uses exactly the same llc flags as the
; one above (sm_70, ptx70); only the check prefix differs. If it is meant to
; cover an unsupported PTX version, the -mattr value presumably has to change
; and the assertions be regenerated - confirm.
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_70 -mattr=ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-NO-PTX
7+
8+
; ReLU of an explicit llvm.fma.f16 call: select(fma(a, b, c) > 0.0, fma(a, b, c), 0.0).
; Per the assertions below, the sm_80 configuration with -nvptx-fma-level=2
; expects a single fma.rn.relu.f16; the other configurations expect a separate
; fma.rn.f16 followed by a max (max.f16, or max.f32 with conversions on sm_70).
define half @fma_f16(half %a, half %b, half %c) {
9+
; CHECK-LABEL: fma_f16(
10+
; CHECK: {
11+
; CHECK-NEXT: .reg .b16 %rs<5>;
12+
; CHECK-EMPTY:
13+
; CHECK-NEXT: // %bb.0:
14+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
15+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
16+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
17+
; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
18+
; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs4;
19+
; CHECK-NEXT: ret;
20+
;
21+
; CHECK-NO-FMA-LABEL: fma_f16(
22+
; CHECK-NO-FMA: {
23+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
24+
; CHECK-NO-FMA-EMPTY:
25+
; CHECK-NO-FMA-NEXT: // %bb.0:
26+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
27+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
28+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
29+
; CHECK-NO-FMA-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
30+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
31+
; CHECK-NO-FMA-NEXT: max.f16 %rs6, %rs4, %rs5;
32+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0+0], %rs6;
33+
; CHECK-NO-FMA-NEXT: ret;
34+
;
35+
; CHECK-NO-ARCH-LABEL: fma_f16(
36+
; CHECK-NO-ARCH: {
37+
; CHECK-NO-ARCH-NEXT: .reg .b16 %rs<6>;
38+
; CHECK-NO-ARCH-NEXT: .reg .f32 %f<3>;
39+
; CHECK-NO-ARCH-EMPTY:
40+
; CHECK-NO-ARCH-NEXT: // %bb.0:
41+
; CHECK-NO-ARCH-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
42+
; CHECK-NO-ARCH-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
43+
; CHECK-NO-ARCH-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
44+
; CHECK-NO-ARCH-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
45+
; CHECK-NO-ARCH-NEXT: cvt.f32.f16 %f1, %rs4;
46+
; CHECK-NO-ARCH-NEXT: max.f32 %f2, %f1, 0f00000000;
47+
; CHECK-NO-ARCH-NEXT: cvt.rn.f16.f32 %rs5, %f2;
48+
; CHECK-NO-ARCH-NEXT: st.param.b16 [func_retval0+0], %rs5;
49+
; CHECK-NO-ARCH-NEXT: ret;
50+
;
51+
; CHECK-NO-PTX-LABEL: fma_f16(
52+
; CHECK-NO-PTX: {
53+
; CHECK-NO-PTX-NEXT: .reg .b16 %rs<6>;
54+
; CHECK-NO-PTX-NEXT: .reg .f32 %f<3>;
55+
; CHECK-NO-PTX-EMPTY:
56+
; CHECK-NO-PTX-NEXT: // %bb.0:
57+
; CHECK-NO-PTX-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
58+
; CHECK-NO-PTX-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
59+
; CHECK-NO-PTX-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
60+
; CHECK-NO-PTX-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
61+
; CHECK-NO-PTX-NEXT: cvt.f32.f16 %f1, %rs4;
62+
; CHECK-NO-PTX-NEXT: max.f32 %f2, %f1, 0f00000000;
63+
; CHECK-NO-PTX-NEXT: cvt.rn.f16.f32 %rs5, %f2;
64+
; CHECK-NO-PTX-NEXT: st.param.b16 [func_retval0+0], %rs5;
65+
; CHECK-NO-PTX-NEXT: ret;
66+
%1 = call half @llvm.fma.f16(half %a, half %b, half %c)
67+
%2 = fcmp ogt half %1, 0.0
68+
%3 = select i1 %2, half %1, half 0.0
69+
ret half %3
70+
}
71+
72+
; Same ReLU shape as an fma-then-clamp, but the multiply-add is written as a
; separate fmul and fadd on half. Per the assertions below, every configuration
; expects the pair to be emitted as one fused fma; only the sm_80 configuration
; with -nvptx-fma-level=2 expects the clamp to be folded in (fma.rn.relu.f16).
define half @fma_f16_expanded(half %a, half %b, half %c) {
73+
; CHECK-LABEL: fma_f16_expanded(
74+
; CHECK: {
75+
; CHECK-NEXT: .reg .b16 %rs<5>;
76+
; CHECK-EMPTY:
77+
; CHECK-NEXT: // %bb.0:
78+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
79+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
80+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
81+
; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
82+
; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs4;
83+
; CHECK-NEXT: ret;
84+
;
85+
; CHECK-NO-FMA-LABEL: fma_f16_expanded(
86+
; CHECK-NO-FMA: {
87+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
88+
; CHECK-NO-FMA-EMPTY:
89+
; CHECK-NO-FMA-NEXT: // %bb.0:
90+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
91+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
92+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
93+
; CHECK-NO-FMA-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
94+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
95+
; CHECK-NO-FMA-NEXT: max.f16 %rs6, %rs4, %rs5;
96+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0+0], %rs6;
97+
; CHECK-NO-FMA-NEXT: ret;
98+
;
99+
; CHECK-NO-ARCH-LABEL: fma_f16_expanded(
100+
; CHECK-NO-ARCH: {
101+
; CHECK-NO-ARCH-NEXT: .reg .b16 %rs<6>;
102+
; CHECK-NO-ARCH-NEXT: .reg .f32 %f<3>;
103+
; CHECK-NO-ARCH-EMPTY:
104+
; CHECK-NO-ARCH-NEXT: // %bb.0:
105+
; CHECK-NO-ARCH-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
106+
; CHECK-NO-ARCH-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
107+
; CHECK-NO-ARCH-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
108+
; CHECK-NO-ARCH-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
109+
; CHECK-NO-ARCH-NEXT: cvt.f32.f16 %f1, %rs4;
110+
; CHECK-NO-ARCH-NEXT: max.f32 %f2, %f1, 0f00000000;
111+
; CHECK-NO-ARCH-NEXT: cvt.rn.f16.f32 %rs5, %f2;
112+
; CHECK-NO-ARCH-NEXT: st.param.b16 [func_retval0+0], %rs5;
113+
; CHECK-NO-ARCH-NEXT: ret;
114+
;
115+
; CHECK-NO-PTX-LABEL: fma_f16_expanded(
116+
; CHECK-NO-PTX: {
117+
; CHECK-NO-PTX-NEXT: .reg .b16 %rs<6>;
118+
; CHECK-NO-PTX-NEXT: .reg .f32 %f<3>;
119+
; CHECK-NO-PTX-EMPTY:
120+
; CHECK-NO-PTX-NEXT: // %bb.0:
121+
; CHECK-NO-PTX-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
122+
; CHECK-NO-PTX-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
123+
; CHECK-NO-PTX-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
124+
; CHECK-NO-PTX-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
125+
; CHECK-NO-PTX-NEXT: cvt.f32.f16 %f1, %rs4;
126+
; CHECK-NO-PTX-NEXT: max.f32 %f2, %f1, 0f00000000;
127+
; CHECK-NO-PTX-NEXT: cvt.rn.f16.f32 %rs5, %f2;
128+
; CHECK-NO-PTX-NEXT: st.param.b16 [func_retval0+0], %rs5;
129+
; CHECK-NO-PTX-NEXT: ret;
130+
%1 = fmul half %a, %b
131+
%2 = fadd half %1, %c
132+
%3 = fcmp ogt half %2, 0.0
133+
%4 = select i1 %3, half %2, half 0.0
134+
ret half %4
135+
}
136+
137+
; ReLU of an explicit llvm.fma.bf16 call: select(fma(a, b, c) > 0.0, fma(a, b, c), 0.0).
; Per the assertions below, the sm_80 configuration with -nvptx-fma-level=2
; expects a single fma.rn.relu.bf16, and -nvptx-fma-level=0 expects fma.rn.bf16
; followed by max.bf16. The sm_70 configurations expect the arithmetic in f32
; (fma.rn.f32, max.f32) surrounded by integer instruction sequences.
define bfloat @fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
138+
; CHECK-LABEL: fma_bf16(
139+
; CHECK: {
140+
; CHECK-NEXT: .reg .b16 %rs<5>;
141+
; CHECK-EMPTY:
142+
; CHECK-NEXT: // %bb.0:
143+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
144+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
145+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
146+
; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
147+
; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs4;
148+
; CHECK-NEXT: ret;
149+
;
150+
; CHECK-NO-FMA-LABEL: fma_bf16(
151+
; CHECK-NO-FMA: {
152+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
153+
; CHECK-NO-FMA-EMPTY:
154+
; CHECK-NO-FMA-NEXT: // %bb.0:
155+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
156+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
157+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
158+
; CHECK-NO-FMA-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
159+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
160+
; CHECK-NO-FMA-NEXT: max.bf16 %rs6, %rs4, %rs5;
161+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0+0], %rs6;
162+
; CHECK-NO-FMA-NEXT: ret;
163+
;
164+
; CHECK-NO-ARCH-LABEL: fma_bf16(
165+
; CHECK-NO-ARCH: {
166+
; CHECK-NO-ARCH-NEXT: .reg .pred %p<3>;
167+
; CHECK-NO-ARCH-NEXT: .reg .b16 %rs<3>;
168+
; CHECK-NO-ARCH-NEXT: .reg .b32 %r<20>;
169+
; CHECK-NO-ARCH-NEXT: .reg .f32 %f<7>;
170+
; CHECK-NO-ARCH-EMPTY:
171+
; CHECK-NO-ARCH-NEXT: // %bb.0:
172+
; CHECK-NO-ARCH-NEXT: ld.param.u16 %r1, [fma_bf16_param_2];
173+
; CHECK-NO-ARCH-NEXT: shl.b32 %r2, %r1, 16;
174+
; CHECK-NO-ARCH-NEXT: mov.b32 %f1, %r2;
175+
; CHECK-NO-ARCH-NEXT: ld.param.u16 %r3, [fma_bf16_param_1];
176+
; CHECK-NO-ARCH-NEXT: shl.b32 %r4, %r3, 16;
177+
; CHECK-NO-ARCH-NEXT: mov.b32 %f2, %r4;
178+
; CHECK-NO-ARCH-NEXT: ld.param.u16 %r5, [fma_bf16_param_0];
179+
; CHECK-NO-ARCH-NEXT: shl.b32 %r6, %r5, 16;
180+
; CHECK-NO-ARCH-NEXT: mov.b32 %f3, %r6;
181+
; CHECK-NO-ARCH-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
182+
; CHECK-NO-ARCH-NEXT: mov.b32 %r7, %f4;
183+
; CHECK-NO-ARCH-NEXT: bfe.u32 %r8, %r7, 16, 1;
184+
; CHECK-NO-ARCH-NEXT: add.s32 %r9, %r8, %r7;
185+
; CHECK-NO-ARCH-NEXT: add.s32 %r10, %r9, 32767;
186+
; CHECK-NO-ARCH-NEXT: setp.nan.f32 %p1, %f4, %f4;
187+
; CHECK-NO-ARCH-NEXT: or.b32 %r11, %r7, 4194304;
188+
; CHECK-NO-ARCH-NEXT: selp.b32 %r12, %r11, %r10, %p1;
189+
; CHECK-NO-ARCH-NEXT: and.b32 %r13, %r12, -65536;
190+
; CHECK-NO-ARCH-NEXT: mov.b32 %f5, %r13;
191+
; CHECK-NO-ARCH-NEXT: max.f32 %f6, %f5, 0f00000000;
192+
; CHECK-NO-ARCH-NEXT: mov.b32 %r14, %f6;
193+
; CHECK-NO-ARCH-NEXT: bfe.u32 %r15, %r14, 16, 1;
194+
; CHECK-NO-ARCH-NEXT: add.s32 %r16, %r15, %r14;
195+
; CHECK-NO-ARCH-NEXT: add.s32 %r17, %r16, 32767;
196+
; CHECK-NO-ARCH-NEXT: setp.nan.f32 %p2, %f6, %f6;
197+
; CHECK-NO-ARCH-NEXT: or.b32 %r18, %r14, 4194304;
198+
; CHECK-NO-ARCH-NEXT: selp.b32 %r19, %r18, %r17, %p2;
199+
; CHECK-NO-ARCH-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
200+
; CHECK-NO-ARCH-NEXT: st.param.b16 [func_retval0+0], %rs1;
201+
; CHECK-NO-ARCH-NEXT: ret;
202+
;
203+
; CHECK-NO-PTX-LABEL: fma_bf16(
204+
; CHECK-NO-PTX: {
205+
; CHECK-NO-PTX-NEXT: .reg .pred %p<3>;
206+
; CHECK-NO-PTX-NEXT: .reg .b16 %rs<3>;
207+
; CHECK-NO-PTX-NEXT: .reg .b32 %r<20>;
208+
; CHECK-NO-PTX-NEXT: .reg .f32 %f<7>;
209+
; CHECK-NO-PTX-EMPTY:
210+
; CHECK-NO-PTX-NEXT: // %bb.0:
211+
; CHECK-NO-PTX-NEXT: ld.param.u16 %r1, [fma_bf16_param_2];
212+
; CHECK-NO-PTX-NEXT: shl.b32 %r2, %r1, 16;
213+
; CHECK-NO-PTX-NEXT: mov.b32 %f1, %r2;
214+
; CHECK-NO-PTX-NEXT: ld.param.u16 %r3, [fma_bf16_param_1];
215+
; CHECK-NO-PTX-NEXT: shl.b32 %r4, %r3, 16;
216+
; CHECK-NO-PTX-NEXT: mov.b32 %f2, %r4;
217+
; CHECK-NO-PTX-NEXT: ld.param.u16 %r5, [fma_bf16_param_0];
218+
; CHECK-NO-PTX-NEXT: shl.b32 %r6, %r5, 16;
219+
; CHECK-NO-PTX-NEXT: mov.b32 %f3, %r6;
220+
; CHECK-NO-PTX-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
221+
; CHECK-NO-PTX-NEXT: mov.b32 %r7, %f4;
222+
; CHECK-NO-PTX-NEXT: bfe.u32 %r8, %r7, 16, 1;
223+
; CHECK-NO-PTX-NEXT: add.s32 %r9, %r8, %r7;
224+
; CHECK-NO-PTX-NEXT: add.s32 %r10, %r9, 32767;
225+
; CHECK-NO-PTX-NEXT: setp.nan.f32 %p1, %f4, %f4;
226+
; CHECK-NO-PTX-NEXT: or.b32 %r11, %r7, 4194304;
227+
; CHECK-NO-PTX-NEXT: selp.b32 %r12, %r11, %r10, %p1;
228+
; CHECK-NO-PTX-NEXT: and.b32 %r13, %r12, -65536;
229+
; CHECK-NO-PTX-NEXT: mov.b32 %f5, %r13;
230+
; CHECK-NO-PTX-NEXT: max.f32 %f6, %f5, 0f00000000;
231+
; CHECK-NO-PTX-NEXT: mov.b32 %r14, %f6;
232+
; CHECK-NO-PTX-NEXT: bfe.u32 %r15, %r14, 16, 1;
233+
; CHECK-NO-PTX-NEXT: add.s32 %r16, %r15, %r14;
234+
; CHECK-NO-PTX-NEXT: add.s32 %r17, %r16, 32767;
235+
; CHECK-NO-PTX-NEXT: setp.nan.f32 %p2, %f6, %f6;
236+
; CHECK-NO-PTX-NEXT: or.b32 %r18, %r14, 4194304;
237+
; CHECK-NO-PTX-NEXT: selp.b32 %r19, %r18, %r17, %p2;
238+
; CHECK-NO-PTX-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
239+
; CHECK-NO-PTX-NEXT: st.param.b16 [func_retval0+0], %rs1;
240+
; CHECK-NO-PTX-NEXT: ret;
241+
%1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
242+
%2 = fcmp ogt bfloat %1, 0.0
243+
%3 = select i1 %2, bfloat %1, bfloat 0.0
244+
ret bfloat %3
245+
}
246+
247+
; Same ReLU shape as an fma-then-clamp, but the multiply-add is written as a
; separate fmul and fadd on bfloat. Per the assertions below, every
; configuration expects the pair to be emitted as one fused fma; only the sm_80
; configuration with -nvptx-fma-level=2 expects the clamp to be folded in
; (fma.rn.relu.bf16).
define bfloat @fma_bf16_expanded(bfloat %a, bfloat %b, bfloat %c) {
248+
; CHECK-LABEL: fma_bf16_expanded(
249+
; CHECK: {
250+
; CHECK-NEXT: .reg .b16 %rs<5>;
251+
; CHECK-EMPTY:
252+
; CHECK-NEXT: // %bb.0:
253+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
254+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
255+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
256+
; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
257+
; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs4;
258+
; CHECK-NEXT: ret;
259+
;
260+
; CHECK-NO-FMA-LABEL: fma_bf16_expanded(
261+
; CHECK-NO-FMA: {
262+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
263+
; CHECK-NO-FMA-EMPTY:
264+
; CHECK-NO-FMA-NEXT: // %bb.0:
265+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
266+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
267+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
268+
; CHECK-NO-FMA-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
269+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
270+
; CHECK-NO-FMA-NEXT: max.bf16 %rs6, %rs4, %rs5;
271+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0+0], %rs6;
272+
; CHECK-NO-FMA-NEXT: ret;
273+
;
274+
; CHECK-NO-ARCH-LABEL: fma_bf16_expanded(
275+
; CHECK-NO-ARCH: {
276+
; CHECK-NO-ARCH-NEXT: .reg .pred %p<3>;
277+
; CHECK-NO-ARCH-NEXT: .reg .b16 %rs<3>;
278+
; CHECK-NO-ARCH-NEXT: .reg .b32 %r<20>;
279+
; CHECK-NO-ARCH-NEXT: .reg .f32 %f<7>;
280+
; CHECK-NO-ARCH-EMPTY:
281+
; CHECK-NO-ARCH-NEXT: // %bb.0:
282+
; CHECK-NO-ARCH-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_param_2];
283+
; CHECK-NO-ARCH-NEXT: shl.b32 %r2, %r1, 16;
284+
; CHECK-NO-ARCH-NEXT: mov.b32 %f1, %r2;
285+
; CHECK-NO-ARCH-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_param_1];
286+
; CHECK-NO-ARCH-NEXT: shl.b32 %r4, %r3, 16;
287+
; CHECK-NO-ARCH-NEXT: mov.b32 %f2, %r4;
288+
; CHECK-NO-ARCH-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_param_0];
289+
; CHECK-NO-ARCH-NEXT: shl.b32 %r6, %r5, 16;
290+
; CHECK-NO-ARCH-NEXT: mov.b32 %f3, %r6;
291+
; CHECK-NO-ARCH-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
292+
; CHECK-NO-ARCH-NEXT: mov.b32 %r7, %f4;
293+
; CHECK-NO-ARCH-NEXT: bfe.u32 %r8, %r7, 16, 1;
294+
; CHECK-NO-ARCH-NEXT: add.s32 %r9, %r8, %r7;
295+
; CHECK-NO-ARCH-NEXT: add.s32 %r10, %r9, 32767;
296+
; CHECK-NO-ARCH-NEXT: setp.nan.f32 %p1, %f4, %f4;
297+
; CHECK-NO-ARCH-NEXT: or.b32 %r11, %r7, 4194304;
298+
; CHECK-NO-ARCH-NEXT: selp.b32 %r12, %r11, %r10, %p1;
299+
; CHECK-NO-ARCH-NEXT: and.b32 %r13, %r12, -65536;
300+
; CHECK-NO-ARCH-NEXT: mov.b32 %f5, %r13;
301+
; CHECK-NO-ARCH-NEXT: max.f32 %f6, %f5, 0f00000000;
302+
; CHECK-NO-ARCH-NEXT: mov.b32 %r14, %f6;
303+
; CHECK-NO-ARCH-NEXT: bfe.u32 %r15, %r14, 16, 1;
304+
; CHECK-NO-ARCH-NEXT: add.s32 %r16, %r15, %r14;
305+
; CHECK-NO-ARCH-NEXT: add.s32 %r17, %r16, 32767;
306+
; CHECK-NO-ARCH-NEXT: setp.nan.f32 %p2, %f6, %f6;
307+
; CHECK-NO-ARCH-NEXT: or.b32 %r18, %r14, 4194304;
308+
; CHECK-NO-ARCH-NEXT: selp.b32 %r19, %r18, %r17, %p2;
309+
; CHECK-NO-ARCH-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
310+
; CHECK-NO-ARCH-NEXT: st.param.b16 [func_retval0+0], %rs1;
311+
; CHECK-NO-ARCH-NEXT: ret;
312+
;
313+
; CHECK-NO-PTX-LABEL: fma_bf16_expanded(
314+
; CHECK-NO-PTX: {
315+
; CHECK-NO-PTX-NEXT: .reg .pred %p<3>;
316+
; CHECK-NO-PTX-NEXT: .reg .b16 %rs<3>;
317+
; CHECK-NO-PTX-NEXT: .reg .b32 %r<20>;
318+
; CHECK-NO-PTX-NEXT: .reg .f32 %f<7>;
319+
; CHECK-NO-PTX-EMPTY:
320+
; CHECK-NO-PTX-NEXT: // %bb.0:
321+
; CHECK-NO-PTX-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_param_2];
322+
; CHECK-NO-PTX-NEXT: shl.b32 %r2, %r1, 16;
323+
; CHECK-NO-PTX-NEXT: mov.b32 %f1, %r2;
324+
; CHECK-NO-PTX-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_param_1];
325+
; CHECK-NO-PTX-NEXT: shl.b32 %r4, %r3, 16;
326+
; CHECK-NO-PTX-NEXT: mov.b32 %f2, %r4;
327+
; CHECK-NO-PTX-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_param_0];
328+
; CHECK-NO-PTX-NEXT: shl.b32 %r6, %r5, 16;
329+
; CHECK-NO-PTX-NEXT: mov.b32 %f3, %r6;
330+
; CHECK-NO-PTX-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
331+
; CHECK-NO-PTX-NEXT: mov.b32 %r7, %f4;
332+
; CHECK-NO-PTX-NEXT: bfe.u32 %r8, %r7, 16, 1;
333+
; CHECK-NO-PTX-NEXT: add.s32 %r9, %r8, %r7;
334+
; CHECK-NO-PTX-NEXT: add.s32 %r10, %r9, 32767;
335+
; CHECK-NO-PTX-NEXT: setp.nan.f32 %p1, %f4, %f4;
336+
; CHECK-NO-PTX-NEXT: or.b32 %r11, %r7, 4194304;
337+
; CHECK-NO-PTX-NEXT: selp.b32 %r12, %r11, %r10, %p1;
338+
; CHECK-NO-PTX-NEXT: and.b32 %r13, %r12, -65536;
339+
; CHECK-NO-PTX-NEXT: mov.b32 %f5, %r13;
340+
; CHECK-NO-PTX-NEXT: max.f32 %f6, %f5, 0f00000000;
341+
; CHECK-NO-PTX-NEXT: mov.b32 %r14, %f6;
342+
; CHECK-NO-PTX-NEXT: bfe.u32 %r15, %r14, 16, 1;
343+
; CHECK-NO-PTX-NEXT: add.s32 %r16, %r15, %r14;
344+
; CHECK-NO-PTX-NEXT: add.s32 %r17, %r16, 32767;
345+
; CHECK-NO-PTX-NEXT: setp.nan.f32 %p2, %f6, %f6;
346+
; CHECK-NO-PTX-NEXT: or.b32 %r18, %r14, 4194304;
347+
; CHECK-NO-PTX-NEXT: selp.b32 %r19, %r18, %r17, %p2;
348+
; CHECK-NO-PTX-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
349+
; CHECK-NO-PTX-NEXT: st.param.b16 [func_retval0+0], %rs1;
350+
; CHECK-NO-PTX-NEXT: ret;
351+
%1 = fmul bfloat %a, %b
352+
%2 = fadd bfloat %1, %c
353+
%3 = fcmp ogt bfloat %2, 0.0
354+
%4 = select i1 %3, bfloat %2, bfloat 0.0
355+
ret bfloat %4
356+
}

0 commit comments

Comments
 (0)