Skip to content

Commit 9456007

Browse files
author
Hugh Delaney
committed
Add patterns for fma.relu.{f16|bf16}
Add patterns to lower fma(a, b, c) > 0 ? fma(a, b, c) : 0 for f16 and bf16 types.
1 parent 05b6c2e commit 9456007

File tree

2 files changed

+386
-0
lines changed

2 files changed

+386
-0
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3917,3 +3917,40 @@ def atomic_thread_fence_seq_cst_cta :
39173917
def atomic_thread_fence_acq_rel_cta :
39183918
NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
39193919
Requires<[hasPTX<60>, hasSM<70>]>;
3920+
3921+
// Floating-point immediate leaf (any FP type) accepted only when
// Imm.isExactlyValue(+0.0) holds. Used as the second fmaxnum operand in the
// fma.relu selection patterns in this file.
def fpimm0 : FPImmLeaf<fAny, [{
3922+
return Imm.isExactlyValue(+0.0);
3923+
}]>;
3924+
3925+
// f16 fused multiply-add with ReLU: $dst, $a, $b and $c are all Int16Regs.
// The instruction carries an empty pattern list; it is selected through the
// separate Pat<> definitions. Gated on useFP16Math, PTX >= 7.0 and SM >= 80.
def FMARELU_F16 :
3926+
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
3927+
"fma.rn.relu.f16 \t$dst, $a, $b, $c;", []>,
3928+
Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
3929+
// bf16 fused multiply-add with ReLU: $dst, $a, $b and $c are all Int16Regs.
// The instruction carries an empty pattern list; it is selected through the
// separate Pat<> definitions. Gated on hasBF16Math, PTX >= 7.0 and SM >= 80.
def FMARELU_BF16 :
3930+
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
3931+
"fma.rn.relu.bf16 \t$dst, $a, $b, $c;", []>,
3932+
Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
3933+
// Flush-to-zero form of the f16 fused multiply-add with ReLU.
// PTX spells this instruction fma.rnd{.ftz}.relu.f16, i.e. the .ftz modifier
// must precede .relu; "fma.rn.relu.ftz.f16" is not valid PTX.
// Operands and result are Int16Regs; the pattern list is empty because
// selection happens through the separate Pat<> definitions.
def FMARELU_F16_FTZ :
3934+
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
3935+
"fma.rn.ftz.relu.f16 \t$dst, $a, $b, $c;", []>,
3936+
Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
3937+
// bf16 fused multiply-add with ReLU, used by the bf16 pattern that is guarded
// by doF32FTZ. PTX defines only fma.rnd{.relu}.bf16 -- there is no .ftz
// modifier for bf16 -- so this record emits the plain bf16 relu mnemonic
// instead of the invalid "fma.rn.relu.ftz.bf16".
// Operands and result are Int16Regs; the pattern list is empty because
// selection happens through the separate Pat<> definitions.
def FMARELU_BF16_FTZ :
3938+
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
3939+
"fma.rn.relu.bf16 \t$dst, $a, $b, $c;", []>,
3940+
Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
3941+
3942+
3943+
// FTZ variants: select fmaxnum(fma(a, b, c), +0.0) to the *_FTZ relu
// instructions when doF32FTZ holds.
// The Requires<> list on a Pat is what guards the pattern; it is not taken
// from the output instruction, so the SM >= 80 requirement of fma.relu is
// stated here explicitly alongside the PTX version.
3944+
def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
3945+
(FMARELU_F16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
3946+
Requires<[allowFMA, doF32FTZ, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
3947+
def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
3948+
(FMARELU_BF16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
3949+
Requires<[allowFMA, doF32FTZ, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
3950+
// No FTZ: select fmaxnum(fma(a, b, c), +0.0) to the plain relu instructions.
// The Requires<> list on a Pat is what guards the pattern; it is not taken
// from the output instruction, so the SM >= 80 requirement of fma.relu is
// stated here explicitly alongside the PTX version.
3951+
def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
3952+
(FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
3953+
Requires<[allowFMA, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
3954+
def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
3955+
(FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
3956+
Requires<[allowFMA, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s
3+
; The ptxas runs use the same flags as the FileCheck runs. Without
; --enable-unsafe-fp-math the fma.relu patterns are never selected, so ptxas
; would not validate the instructions this test is about.
; RUN: %if ptxas %{ llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | %ptxas-verify -arch=sm_80 %}
4+
5+
; Using FTZ should emit the FTZ form of fma.relu (PTX spelling: fma.rn.ftz.relu)
6+
; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-FTZ
7+
; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | %ptxas-verify -arch=sm_80 %}
8+
9+
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=0 | FileCheck %s --check-prefixes=CHECK-NO-FMA
10+
11+
; SM < 80 or PTX version < 70 should not emit fma{.ftz}.relu
; NOTE(review): the autogenerated CHECK-PTX60 assertions in this file expect
; fma.rn.relu.{f16,bf16}, which contradicts the statement above -- confirm
; whether -mattr=+ptx60 is actually honored together with -mcpu=sm_80.
12+
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_70 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-SM70
13+
; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx60 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-PTX60
14+
15+
define half @fma_f16(half %a, half %b, half %c) {
16+
; CHECK-LABEL: fma_f16(
17+
; CHECK: {
18+
; CHECK-NEXT: .reg .b16 %rs<5>;
19+
; CHECK-EMPTY:
20+
; CHECK-NEXT: // %bb.0:
21+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
22+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
23+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
24+
; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
25+
; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
26+
; CHECK-NEXT: ret;
27+
;
28+
; CHECK-FTZ-LABEL: fma_f16(
29+
; CHECK-FTZ: {
30+
; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
31+
; CHECK-FTZ-EMPTY:
32+
; CHECK-FTZ-NEXT: // %bb.0:
33+
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
34+
; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
35+
; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
36+
; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
37+
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
38+
; CHECK-FTZ-NEXT: ret;
39+
;
40+
; CHECK-NO-FMA-LABEL: fma_f16(
41+
; CHECK-NO-FMA: {
42+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
43+
; CHECK-NO-FMA-EMPTY:
44+
; CHECK-NO-FMA-NEXT: // %bb.0:
45+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
46+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
47+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
48+
; CHECK-NO-FMA-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
49+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
50+
; CHECK-NO-FMA-NEXT: max.f16 %rs6, %rs4, %rs5;
51+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
52+
; CHECK-NO-FMA-NEXT: ret;
53+
;
54+
; CHECK-SM70-LABEL: fma_f16(
55+
; CHECK-SM70: {
56+
; CHECK-SM70-NEXT: .reg .b16 %rs<6>;
57+
; CHECK-SM70-NEXT: .reg .f32 %f<3>;
58+
; CHECK-SM70-EMPTY:
59+
; CHECK-SM70-NEXT: // %bb.0:
60+
; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
61+
; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
62+
; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
63+
; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
64+
; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4;
65+
; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000;
66+
; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2;
67+
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5;
68+
; CHECK-SM70-NEXT: ret;
69+
;
70+
; CHECK-PTX60-LABEL: fma_f16(
71+
; CHECK-PTX60: {
72+
; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
73+
; CHECK-PTX60-EMPTY:
74+
; CHECK-PTX60-NEXT: // %bb.0:
75+
; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
76+
; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
77+
; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
78+
; CHECK-PTX60-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
79+
; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
80+
; CHECK-PTX60-NEXT: ret;
81+
%1 = call half @llvm.fma.f16(half %a, half %b, half %c)
82+
%2 = fcmp ogt half %1, 0.0
83+
%3 = select i1 %2, half %1, half 0.0
84+
ret half %3
85+
}
86+
87+
define half @fma_f16_expanded(half %a, half %b, half %c) {
88+
; CHECK-LABEL: fma_f16_expanded(
89+
; CHECK: {
90+
; CHECK-NEXT: .reg .b16 %rs<5>;
91+
; CHECK-EMPTY:
92+
; CHECK-NEXT: // %bb.0:
93+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
94+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
95+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
96+
; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
97+
; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
98+
; CHECK-NEXT: ret;
99+
;
100+
; CHECK-FTZ-LABEL: fma_f16_expanded(
101+
; CHECK-FTZ: {
102+
; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
103+
; CHECK-FTZ-EMPTY:
104+
; CHECK-FTZ-NEXT: // %bb.0:
105+
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
106+
; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
107+
; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
108+
; CHECK-FTZ-NEXT: fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
109+
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
110+
; CHECK-FTZ-NEXT: ret;
111+
;
112+
; CHECK-NO-FMA-LABEL: fma_f16_expanded(
113+
; CHECK-NO-FMA: {
114+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
115+
; CHECK-NO-FMA-EMPTY:
116+
; CHECK-NO-FMA-NEXT: // %bb.0:
117+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
118+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
119+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
120+
; CHECK-NO-FMA-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
121+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
122+
; CHECK-NO-FMA-NEXT: max.f16 %rs6, %rs4, %rs5;
123+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
124+
; CHECK-NO-FMA-NEXT: ret;
125+
;
126+
; CHECK-SM70-LABEL: fma_f16_expanded(
127+
; CHECK-SM70: {
128+
; CHECK-SM70-NEXT: .reg .b16 %rs<6>;
129+
; CHECK-SM70-NEXT: .reg .f32 %f<3>;
130+
; CHECK-SM70-EMPTY:
131+
; CHECK-SM70-NEXT: // %bb.0:
132+
; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
133+
; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
134+
; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
135+
; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
136+
; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4;
137+
; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000;
138+
; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2;
139+
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5;
140+
; CHECK-SM70-NEXT: ret;
141+
;
142+
; CHECK-PTX60-LABEL: fma_f16_expanded(
143+
; CHECK-PTX60: {
144+
; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
145+
; CHECK-PTX60-EMPTY:
146+
; CHECK-PTX60-NEXT: // %bb.0:
147+
; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
148+
; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
149+
; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
150+
; CHECK-PTX60-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
151+
; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
152+
; CHECK-PTX60-NEXT: ret;
153+
%1 = fmul half %a, %b
154+
%2 = fadd half %1, %c
155+
%3 = fcmp ogt half %2, 0.0
156+
%4 = select i1 %3, half %2, half 0.0
157+
ret half %4
158+
}
159+
160+
define bfloat @fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
161+
; CHECK-LABEL: fma_bf16(
162+
; CHECK: {
163+
; CHECK-NEXT: .reg .b16 %rs<5>;
164+
; CHECK-EMPTY:
165+
; CHECK-NEXT: // %bb.0:
166+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
167+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
168+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
169+
; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
170+
; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
171+
; CHECK-NEXT: ret;
172+
;
173+
; CHECK-FTZ-LABEL: fma_bf16(
174+
; CHECK-FTZ: {
175+
; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
176+
; CHECK-FTZ-EMPTY:
177+
; CHECK-FTZ-NEXT: // %bb.0:
178+
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
179+
; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
180+
; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
181+
; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
182+
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
183+
; CHECK-FTZ-NEXT: ret;
184+
;
185+
; CHECK-NO-FMA-LABEL: fma_bf16(
186+
; CHECK-NO-FMA: {
187+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
188+
; CHECK-NO-FMA-EMPTY:
189+
; CHECK-NO-FMA-NEXT: // %bb.0:
190+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
191+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
192+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
193+
; CHECK-NO-FMA-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
194+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
195+
; CHECK-NO-FMA-NEXT: max.bf16 %rs6, %rs4, %rs5;
196+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
197+
; CHECK-NO-FMA-NEXT: ret;
198+
;
199+
; CHECK-SM70-LABEL: fma_bf16(
200+
; CHECK-SM70: {
201+
; CHECK-SM70-NEXT: .reg .pred %p<3>;
202+
; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
203+
; CHECK-SM70-NEXT: .reg .b32 %r<20>;
204+
; CHECK-SM70-NEXT: .reg .f32 %f<7>;
205+
; CHECK-SM70-EMPTY:
206+
; CHECK-SM70-NEXT: // %bb.0:
207+
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_param_2];
208+
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
209+
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
210+
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_param_1];
211+
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
212+
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
213+
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_param_0];
214+
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
215+
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
216+
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
217+
; CHECK-SM70-NEXT: mov.b32 %r7, %f4;
218+
; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
219+
; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
220+
; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
221+
; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
222+
; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
223+
; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
224+
; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
225+
; CHECK-SM70-NEXT: mov.b32 %f5, %r13;
226+
; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000;
227+
; CHECK-SM70-NEXT: mov.b32 %r14, %f6;
228+
; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
229+
; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
230+
; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
231+
; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
232+
; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
233+
; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
234+
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
235+
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
236+
; CHECK-SM70-NEXT: ret;
237+
;
238+
; CHECK-PTX60-LABEL: fma_bf16(
239+
; CHECK-PTX60: {
240+
; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
241+
; CHECK-PTX60-EMPTY:
242+
; CHECK-PTX60-NEXT: // %bb.0:
243+
; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
244+
; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
245+
; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
246+
; CHECK-PTX60-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
247+
; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
248+
; CHECK-PTX60-NEXT: ret;
249+
%1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
250+
%2 = fcmp ogt bfloat %1, 0.0
251+
%3 = select i1 %2, bfloat %1, bfloat 0.0
252+
ret bfloat %3
253+
}
254+
255+
define bfloat @fma_bf16_expanded(bfloat %a, bfloat %b, bfloat %c) {
256+
; CHECK-LABEL: fma_bf16_expanded(
257+
; CHECK: {
258+
; CHECK-NEXT: .reg .b16 %rs<5>;
259+
; CHECK-EMPTY:
260+
; CHECK-NEXT: // %bb.0:
261+
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
262+
; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
263+
; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
264+
; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
265+
; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
266+
; CHECK-NEXT: ret;
267+
;
268+
; CHECK-FTZ-LABEL: fma_bf16_expanded(
269+
; CHECK-FTZ: {
270+
; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
271+
; CHECK-FTZ-EMPTY:
272+
; CHECK-FTZ-NEXT: // %bb.0:
273+
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
274+
; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
275+
; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
276+
; CHECK-FTZ-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
277+
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
278+
; CHECK-FTZ-NEXT: ret;
279+
;
280+
; CHECK-NO-FMA-LABEL: fma_bf16_expanded(
281+
; CHECK-NO-FMA: {
282+
; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
283+
; CHECK-NO-FMA-EMPTY:
284+
; CHECK-NO-FMA-NEXT: // %bb.0:
285+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
286+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
287+
; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
288+
; CHECK-NO-FMA-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
289+
; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
290+
; CHECK-NO-FMA-NEXT: max.bf16 %rs6, %rs4, %rs5;
291+
; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
292+
; CHECK-NO-FMA-NEXT: ret;
293+
;
294+
; CHECK-SM70-LABEL: fma_bf16_expanded(
295+
; CHECK-SM70: {
296+
; CHECK-SM70-NEXT: .reg .pred %p<3>;
297+
; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
298+
; CHECK-SM70-NEXT: .reg .b32 %r<20>;
299+
; CHECK-SM70-NEXT: .reg .f32 %f<7>;
300+
; CHECK-SM70-EMPTY:
301+
; CHECK-SM70-NEXT: // %bb.0:
302+
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_param_2];
303+
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
304+
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
305+
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_param_1];
306+
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
307+
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
308+
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_param_0];
309+
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
310+
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
311+
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
312+
; CHECK-SM70-NEXT: mov.b32 %r7, %f4;
313+
; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
314+
; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
315+
; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
316+
; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
317+
; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
318+
; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
319+
; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
320+
; CHECK-SM70-NEXT: mov.b32 %f5, %r13;
321+
; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000;
322+
; CHECK-SM70-NEXT: mov.b32 %r14, %f6;
323+
; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
324+
; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
325+
; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
326+
; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
327+
; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
328+
; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
329+
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
330+
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
331+
; CHECK-SM70-NEXT: ret;
332+
;
333+
; CHECK-PTX60-LABEL: fma_bf16_expanded(
334+
; CHECK-PTX60: {
335+
; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
336+
; CHECK-PTX60-EMPTY:
337+
; CHECK-PTX60-NEXT: // %bb.0:
338+
; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
339+
; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
340+
; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
341+
; CHECK-PTX60-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
342+
; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
343+
; CHECK-PTX60-NEXT: ret;
344+
%1 = fmul bfloat %a, %b
345+
%2 = fadd bfloat %1, %c
346+
%3 = fcmp ogt bfloat %2, 0.0
347+
%4 = select i1 %3, bfloat %2, bfloat 0.0
348+
ret bfloat %4
349+
}

0 commit comments

Comments
 (0)