@@ -36,18 +36,18 @@ define half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 {
3636;
3737; CHECK-SM70-LABEL: fma_f16_expanded_no_nans(
3838; CHECK-SM70: {
39- ; CHECK-SM70-NEXT: .reg .b16 %rs<6 >;
40- ; CHECK-SM70-NEXT: .reg .f32 %f<3 >;
39+ ; CHECK-SM70-NEXT: .reg .pred %p<2 >;
40+ ; CHECK-SM70-NEXT: .reg .b16 %rs<7 >;
4141; CHECK-SM70-EMPTY:
4242; CHECK-SM70-NEXT: // %bb.0:
4343; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_param_0];
4444; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_param_1];
4545; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_param_2];
4646; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
47- ; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4 ;
48- ; CHECK-SM70-NEXT: max.f32 %f2 , %f1, 0f00000000 ;
49- ; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5 , %f2 ;
50- ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5 ;
47+ ; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000 ;
48+ ; CHECK-SM70-NEXT: setp.gt.f16 %p1 , %rs4, %rs5 ;
49+ ; CHECK-SM70-NEXT: selp.b16 %rs6 , %rs4, 0x0000, %p1 ;
50+ ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6 ;
5151; CHECK-SM70-NEXT: ret;
5252 %1 = fmul half %a , %b
5353 %2 = fadd half %1 , %c
@@ -94,21 +94,21 @@ define half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, hal
9494;
9595; CHECK-SM70-LABEL: fma_f16_expanded_no_nans_multiple_uses_of_fma(
9696; CHECK-SM70: {
97- ; CHECK-SM70-NEXT: .reg .b16 %rs<9 >;
98- ; CHECK-SM70-NEXT: .reg .f32 %f<3 >;
97+ ; CHECK-SM70-NEXT: .reg .pred %p<2 >;
98+ ; CHECK-SM70-NEXT: .reg .b16 %rs<10 >;
9999; CHECK-SM70-EMPTY:
100100; CHECK-SM70-NEXT: // %bb.0:
101101; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_0];
102102; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_1];
103103; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_no_nans_multiple_uses_of_fma_param_2];
104104; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
105- ; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4 ;
106- ; CHECK-SM70-NEXT: max.f32 %f2 , %f1, 0f00000000 ;
107- ; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5 , %f2 ;
108- ; CHECK-SM70-NEXT: mov.b16 %rs6 , 0x4700;
109- ; CHECK-SM70-NEXT: add.f16 %rs7 , %rs4, %rs6 ;
110- ; CHECK-SM70-NEXT: add.f16 %rs8 , %rs5 , %rs7 ;
111- ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs8 ;
105+ ; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000 ;
106+ ; CHECK-SM70-NEXT: setp.gt.f16 %p1 , %rs4, %rs5 ;
107+ ; CHECK-SM70-NEXT: selp.b16 %rs6 , %rs4, 0x0000, %p1 ;
108+ ; CHECK-SM70-NEXT: mov.b16 %rs7 , 0x4700;
109+ ; CHECK-SM70-NEXT: add.f16 %rs8 , %rs4, %rs7 ;
110+ ; CHECK-SM70-NEXT: add.f16 %rs9 , %rs6 , %rs8 ;
111+ ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs9 ;
112112; CHECK-SM70-NEXT: ret;
113113 %1 = fmul half %a , %b
114114 %2 = fadd half %1 , %c
@@ -150,18 +150,18 @@ define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) #1 {
150150;
151151; CHECK-SM70-LABEL: fma_f16_expanded_unsafe_with_nans(
152152; CHECK-SM70: {
153- ; CHECK-SM70-NEXT: .reg .b16 %rs<6 >;
154- ; CHECK-SM70-NEXT: .reg .f32 %f<3 >;
153+ ; CHECK-SM70-NEXT: .reg .pred %p<2 >;
154+ ; CHECK-SM70-NEXT: .reg .b16 %rs<7 >;
155155; CHECK-SM70-EMPTY:
156156; CHECK-SM70-NEXT: // %bb.0:
157157; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_unsafe_with_nans_param_0];
158158; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_unsafe_with_nans_param_1];
159159; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_unsafe_with_nans_param_2];
160160; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
161- ; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4 ;
162- ; CHECK-SM70-NEXT: max.f32 %f2 , %f1, 0f00000000 ;
163- ; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5 , %f2 ;
164- ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5 ;
161+ ; CHECK-SM70-NEXT: mov.b16 %rs5, 0x0000 ;
162+ ; CHECK-SM70-NEXT: setp.gt.f16 %p1 , %rs4, %rs5 ;
163+ ; CHECK-SM70-NEXT: selp.b16 %rs6 , %rs4, 0x0000, %p1 ;
164+ ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs6 ;
165165; CHECK-SM70-NEXT: ret;
166166 %1 = fmul half %a , %b
167167 %2 = fadd half %1 , %c
@@ -248,9 +248,9 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %
248248; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe_with_nans(
249249; CHECK-SM70: {
250250; CHECK-SM70-NEXT: .reg .pred %p<3>;
251- ; CHECK-SM70-NEXT: .reg .b16 %rs<3 >;
252- ; CHECK-SM70-NEXT: .reg .b32 %r<20 >;
253- ; CHECK-SM70-NEXT: .reg .f32 %f<7 >;
251+ ; CHECK-SM70-NEXT: .reg .b16 %rs<4 >;
252+ ; CHECK-SM70-NEXT: .reg .b32 %r<14 >;
253+ ; CHECK-SM70-NEXT: .reg .f32 %f<6 >;
254254; CHECK-SM70-EMPTY:
255255; CHECK-SM70-NEXT: // %bb.0:
256256; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
@@ -270,18 +270,12 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %
270270; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
271271; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
272272; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
273+ ; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
273274; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
274275; CHECK-SM70-NEXT: mov.b32 %f5, %r13;
275- ; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000;
276- ; CHECK-SM70-NEXT: mov.b32 %r14, %f6;
277- ; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
278- ; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
279- ; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
280- ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
281- ; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
282- ; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
283- ; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
284- ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
276+ ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000;
277+ ; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2;
278+ ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3;
285279; CHECK-SM70-NEXT: ret;
286280 %1 = fmul bfloat %a , %b
287281 %2 = fadd bfloat %1 , %c
@@ -318,9 +312,9 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
318312; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans(
319313; CHECK-SM70: {
320314; CHECK-SM70-NEXT: .reg .pred %p<3>;
321- ; CHECK-SM70-NEXT: .reg .b16 %rs<3 >;
322- ; CHECK-SM70-NEXT: .reg .b32 %r<20 >;
323- ; CHECK-SM70-NEXT: .reg .f32 %f<7 >;
315+ ; CHECK-SM70-NEXT: .reg .b16 %rs<4 >;
316+ ; CHECK-SM70-NEXT: .reg .b32 %r<14 >;
317+ ; CHECK-SM70-NEXT: .reg .f32 %f<6 >;
324318; CHECK-SM70-EMPTY:
325319; CHECK-SM70-NEXT: // %bb.0:
326320; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2];
@@ -340,18 +334,12 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
340334; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
341335; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
342336; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
337+ ; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
343338; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
344339; CHECK-SM70-NEXT: mov.b32 %f5, %r13;
345- ; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000;
346- ; CHECK-SM70-NEXT: mov.b32 %r14, %f6;
347- ; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
348- ; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
349- ; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
350- ; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
351- ; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
352- ; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
353- ; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
354- ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
340+ ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000;
341+ ; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2;
342+ ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3;
355343; CHECK-SM70-NEXT: ret;
356344 %1 = fmul bfloat %a , %b
357345 %2 = fadd bfloat %1 , %c
@@ -423,9 +411,9 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
423411; CHECK-SM70-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
424412; CHECK-SM70: {
425413; CHECK-SM70-NEXT: .reg .pred %p<5>;
426- ; CHECK-SM70-NEXT: .reg .b16 %rs<3 >;
427- ; CHECK-SM70-NEXT: .reg .b32 %r<34 >;
428- ; CHECK-SM70-NEXT: .reg .f32 %f<11 >;
414+ ; CHECK-SM70-NEXT: .reg .b16 %rs<7 >;
415+ ; CHECK-SM70-NEXT: .reg .b32 %r<29 >;
416+ ; CHECK-SM70-NEXT: .reg .f32 %f<10 >;
429417; CHECK-SM70-EMPTY:
430418; CHECK-SM70-NEXT: // %bb.0:
431419; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
@@ -445,38 +433,34 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
445433; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
446434; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
447435; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
436+ ; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
448437; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
449438; CHECK-SM70-NEXT: mov.b32 %f5, %r13;
450- ; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000;
439+ ; CHECK-SM70-NEXT: setp.gt.f32 %p2, %f5, 0f00000000;
440+ ; CHECK-SM70-NEXT: selp.b16 %rs3, %rs1, 0x0000, %p2;
441+ ; CHECK-SM70-NEXT: add.f32 %f6, %f5, 0f40E00000;
451442; CHECK-SM70-NEXT: mov.b32 %r14, %f6;
452443; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
453444; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
454445; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
455- ; CHECK-SM70-NEXT: setp.nan.f32 %p2 , %f6, %f6;
446+ ; CHECK-SM70-NEXT: setp.nan.f32 %p3 , %f6, %f6;
456447; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
457- ; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
458- ; CHECK-SM70-NEXT: add.f32 %f7, %f5, 0f40E00000;
459- ; CHECK-SM70-NEXT: mov.b32 %r20, %f7;
460- ; CHECK-SM70-NEXT: bfe.u32 %r21, %r20, 16, 1;
461- ; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r20;
462- ; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
463- ; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f7, %f7;
464- ; CHECK-SM70-NEXT: or.b32 %r24, %r20, 4194304;
465- ; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p3;
466- ; CHECK-SM70-NEXT: and.b32 %r26, %r25, -65536;
467- ; CHECK-SM70-NEXT: mov.b32 %f8, %r26;
468- ; CHECK-SM70-NEXT: and.b32 %r27, %r19, -65536;
469- ; CHECK-SM70-NEXT: mov.b32 %f9, %r27;
470- ; CHECK-SM70-NEXT: add.f32 %f10, %f9, %f8;
471- ; CHECK-SM70-NEXT: mov.b32 %r28, %f10;
472- ; CHECK-SM70-NEXT: bfe.u32 %r29, %r28, 16, 1;
473- ; CHECK-SM70-NEXT: add.s32 %r30, %r29, %r28;
474- ; CHECK-SM70-NEXT: add.s32 %r31, %r30, 32767;
475- ; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f10, %f10;
476- ; CHECK-SM70-NEXT: or.b32 %r32, %r28, 4194304;
477- ; CHECK-SM70-NEXT: selp.b32 %r33, %r32, %r31, %p4;
478- ; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r33; }
479- ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
448+ ; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3;
449+ ; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs3;
450+ ; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
451+ ; CHECK-SM70-NEXT: mov.b32 %f7, %r21;
452+ ; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536;
453+ ; CHECK-SM70-NEXT: mov.b32 %f8, %r22;
454+ ; CHECK-SM70-NEXT: add.f32 %f9, %f7, %f8;
455+ ; CHECK-SM70-NEXT: mov.b32 %r23, %f9;
456+ ; CHECK-SM70-NEXT: bfe.u32 %r24, %r23, 16, 1;
457+ ; CHECK-SM70-NEXT: add.s32 %r25, %r24, %r23;
458+ ; CHECK-SM70-NEXT: add.s32 %r26, %r25, 32767;
459+ ; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f9, %f9;
460+ ; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304;
461+ ; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4;
462+ ; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs5}, %r28; }
463+ ; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5;
480464; CHECK-SM70-NEXT: ret;
481465 %1 = fmul bfloat %a , %b
482466 %2 = fadd bfloat %1 , %c
0 commit comments