@@ -42,17 +42,14 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
4242;
4343; SM80-LABEL: test_fadd(
4444; SM80: {
45- ; SM80-NEXT: .reg .b16 %rs<4>;
46- ; SM80-NEXT: .reg .f32 %f<4>;
45+ ; SM80-NEXT: .reg .b16 %rs<5>;
4746; SM80-EMPTY:
4847; SM80-NEXT: // %bb.0:
4948; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
5049; SM80-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
51- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
52- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
53- ; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
54- ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
55- ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
50+ ; SM80-NEXT: mov.b16 %rs3, 0x3F80;
51+ ; SM80-NEXT: fma.rn.bf16 %rs4, %rs1, %rs3, %rs2;
52+ ; SM80-NEXT: st.param.b16 [func_retval0], %rs4;
5653; SM80-NEXT: ret;
5754;
5855; SM80-FTZ-LABEL: test_fadd(
@@ -113,17 +110,14 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
113110;
114111; SM80-LABEL: test_fsub(
115112; SM80: {
116- ; SM80-NEXT: .reg .b16 %rs<4>;
117- ; SM80-NEXT: .reg .f32 %f<4>;
113+ ; SM80-NEXT: .reg .b16 %rs<5>;
118114; SM80-EMPTY:
119115; SM80-NEXT: // %bb.0:
120116; SM80-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
121117; SM80-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
122- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
123- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
124- ; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
125- ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
126- ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
118+ ; SM80-NEXT: mov.b16 %rs3, 0xBF80;
119+ ; SM80-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
120+ ; SM80-NEXT: st.param.b16 [func_retval0], %rs4;
127121; SM80-NEXT: ret;
128122;
129123; SM80-FTZ-LABEL: test_fsub(
@@ -202,23 +196,14 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
202196;
203197; SM80-LABEL: test_faddx2(
204198; SM80: {
205- ; SM80-NEXT: .reg .b16 %rs<5>;
206- ; SM80-NEXT: .reg .b32 %r<4>;
207- ; SM80-NEXT: .reg .f32 %f<7>;
199+ ; SM80-NEXT: .reg .b32 %r<5>;
208200; SM80-EMPTY:
209201; SM80-NEXT: // %bb.0:
210- ; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
211- ; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
212- ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
213- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
214- ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
215- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
216- ; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
217- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
218- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
219- ; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
220- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
221- ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
202+ ; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_1];
203+ ; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_0];
204+ ; SM80-NEXT: mov.b32 %r3, 1065369472;
205+ ; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1;
206+ ; SM80-NEXT: st.param.b32 [func_retval0], %r4;
222207; SM80-NEXT: ret;
223208;
224209; SM80-FTZ-LABEL: test_faddx2(
@@ -303,23 +288,14 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
303288;
304289; SM80-LABEL: test_fsubx2(
305290; SM80: {
306- ; SM80-NEXT: .reg .b16 %rs<5>;
307- ; SM80-NEXT: .reg .b32 %r<4>;
308- ; SM80-NEXT: .reg .f32 %f<7>;
291+ ; SM80-NEXT: .reg .b32 %r<5>;
309292; SM80-EMPTY:
310293; SM80-NEXT: // %bb.0:
311294; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
312295; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
313- ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
314- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
315- ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
316- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
317- ; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
318- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
319- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
320- ; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
321- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
322- ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
296+ ; SM80-NEXT: mov.b32 %r3, -1082081408;
297+ ; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1;
298+ ; SM80-NEXT: st.param.b32 [func_retval0], %r4;
323299; SM80-NEXT: ret;
324300;
325301; SM80-FTZ-LABEL: test_fsubx2(
@@ -404,23 +380,14 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
404380;
405381; SM80-LABEL: test_fmulx2(
406382; SM80: {
407- ; SM80-NEXT: .reg .b16 %rs<5>;
408- ; SM80-NEXT: .reg .b32 %r<4>;
409- ; SM80-NEXT: .reg .f32 %f<7>;
383+ ; SM80-NEXT: .reg .b32 %r<5>;
410384; SM80-EMPTY:
411385; SM80-NEXT: // %bb.0:
412- ; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
413- ; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
414- ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
415- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
416- ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
417- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
418- ; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
419- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
420- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
421- ; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
422- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
423- ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
386+ ; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1];
387+ ; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0];
388+ ; SM80-NEXT: mov.b32 %r3, 0;
389+ ; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r1, %r3;
390+ ; SM80-NEXT: st.param.b32 [func_retval0], %r4;
424391; SM80-NEXT: ret;
425392;
426393; SM80-FTZ-LABEL: test_fmulx2(
@@ -727,15 +694,13 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
727694;
728695; SM80-LABEL: test_fadd_imm_1(
729696; SM80: {
730- ; SM80-NEXT: .reg .b16 %rs<3>;
731- ; SM80-NEXT: .reg .f32 %f<3>;
697+ ; SM80-NEXT: .reg .b16 %rs<4>;
732698; SM80-EMPTY:
733699; SM80-NEXT: // %bb.0:
734700; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
735- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
736- ; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
737- ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
738- ; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
701+ ; SM80-NEXT: mov.b16 %rs2, 0x3F80;
702+ ; SM80-NEXT: fma.rn.bf16 %rs3, %rs1, %rs2, %rs2;
703+ ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
739704; SM80-NEXT: ret;
740705;
741706; SM80-FTZ-LABEL: test_fadd_imm_1(
0 commit comments