@@ -204,47 +204,43 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
204204;
205205; SM80-LABEL: test_faddx2(
206206; SM80: {
207- ; SM80-NEXT: .reg .b16 %rs<7 >;
207+ ; SM80-NEXT: .reg .b16 %rs<5 >;
208208; SM80-NEXT: .reg .b32 %r<4>;
209209; SM80-NEXT: .reg .f32 %f<7>;
210210; SM80-EMPTY:
211211; SM80-NEXT: // %bb.0:
212212; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
213213; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
214214; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
215- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
215+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
216216; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
217+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
218218; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
220- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
221- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
219+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
220+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
222221; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
223- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
222+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
225223; SM80-NEXT: st.param.b32 [func_retval0], %r3;
226224; SM80-NEXT: ret;
227225;
228226; SM80-FTZ-LABEL: test_faddx2(
229227; SM80-FTZ: {
230- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
228+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
231229; SM80-FTZ-NEXT: .reg .b32 %r<4>;
232230; SM80-FTZ-NEXT: .reg .f32 %f<7>;
233231; SM80-FTZ-EMPTY:
234232; SM80-FTZ-NEXT: // %bb.0:
235233; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
236234; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
237235; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
238- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
236+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
239237; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
240- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
238+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
241239; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
243- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
244- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
240+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
241+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
245242; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
246- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
243+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
248244; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
249245; SM80-FTZ-NEXT: ret;
250246;
@@ -311,47 +307,43 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
311307;
312308; SM80-LABEL: test_fsubx2(
313309; SM80: {
314- ; SM80-NEXT: .reg .b16 %rs<7 >;
310+ ; SM80-NEXT: .reg .b16 %rs<5 >;
315311; SM80-NEXT: .reg .b32 %r<4>;
316312; SM80-NEXT: .reg .f32 %f<7>;
317313; SM80-EMPTY:
318314; SM80-NEXT: // %bb.0:
319315; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
320316; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
321317; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
322- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
318+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
323319; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
324- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
320+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
325321; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
327- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
328- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
322+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
323+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
329324; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
330- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
325+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
332326; SM80-NEXT: st.param.b32 [func_retval0], %r3;
333327; SM80-NEXT: ret;
334328;
335329; SM80-FTZ-LABEL: test_fsubx2(
336330; SM80-FTZ: {
337- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
331+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
338332; SM80-FTZ-NEXT: .reg .b32 %r<4>;
339333; SM80-FTZ-NEXT: .reg .f32 %f<7>;
340334; SM80-FTZ-EMPTY:
341335; SM80-FTZ-NEXT: // %bb.0:
342336; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
343337; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
344338; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
345- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
339+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
346340; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
347- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
341+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
348342; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
350- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
351- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
343+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
344+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
352345; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
353- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
346+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
355347; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
356348; SM80-FTZ-NEXT: ret;
357349;
@@ -418,47 +410,43 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
418410;
419411; SM80-LABEL: test_fmulx2(
420412; SM80: {
421- ; SM80-NEXT: .reg .b16 %rs<7 >;
413+ ; SM80-NEXT: .reg .b16 %rs<5 >;
422414; SM80-NEXT: .reg .b32 %r<4>;
423415; SM80-NEXT: .reg .f32 %f<7>;
424416; SM80-EMPTY:
425417; SM80-NEXT: // %bb.0:
426418; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
427419; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
428420; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
429- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
421+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
430422; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
431- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
423+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
432424; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
434- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
435- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
425+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
426+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
436427; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
437- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
428+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
439429; SM80-NEXT: st.param.b32 [func_retval0], %r3;
440430; SM80-NEXT: ret;
441431;
442432; SM80-FTZ-LABEL: test_fmulx2(
443433; SM80-FTZ: {
444- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
434+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
445435; SM80-FTZ-NEXT: .reg .b32 %r<4>;
446436; SM80-FTZ-NEXT: .reg .f32 %f<7>;
447437; SM80-FTZ-EMPTY:
448438; SM80-FTZ-NEXT: // %bb.0:
449439; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
450440; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
451441; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
452- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
442+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
453443; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
454- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
444+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
455445; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
457- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
458- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
446+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
447+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
459448; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
460- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
449+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
462450; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
463451; SM80-FTZ-NEXT: ret;
464452;
@@ -525,70 +513,64 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
525513;
526514; SM80-LABEL: test_fdiv(
527515; SM80: {
528- ; SM80-NEXT: .reg .b16 %rs<7 >;
516+ ; SM80-NEXT: .reg .b16 %rs<5 >;
529517; SM80-NEXT: .reg .b32 %r<4>;
530518; SM80-NEXT: .reg .f32 %f<7>;
531519; SM80-EMPTY:
532520; SM80-NEXT: // %bb.0:
533521; SM80-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
534522; SM80-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
535523; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
536- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
524+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
537525; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
538- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
526+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
539527; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
541- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
542- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
528+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
529+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
543530; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
544- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
531+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
546532; SM80-NEXT: st.param.b32 [func_retval0], %r3;
547533; SM80-NEXT: ret;
548534;
549535; SM80-FTZ-LABEL: test_fdiv(
550536; SM80-FTZ: {
551- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
537+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
552538; SM80-FTZ-NEXT: .reg .b32 %r<4>;
553539; SM80-FTZ-NEXT: .reg .f32 %f<7>;
554540; SM80-FTZ-EMPTY:
555541; SM80-FTZ-NEXT: // %bb.0:
556542; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
557543; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
558544; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
559- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
545+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
560546; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
561- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
547+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
562548; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
564- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
565- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
549+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
550+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
566551; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
567- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
552+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
569553; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
570554; SM80-FTZ-NEXT: ret;
571555;
572556; SM90-LABEL: test_fdiv(
573557; SM90: {
574- ; SM90-NEXT: .reg .b16 %rs<7 >;
558+ ; SM90-NEXT: .reg .b16 %rs<5 >;
575559; SM90-NEXT: .reg .b32 %r<4>;
576560; SM90-NEXT: .reg .f32 %f<7>;
577561; SM90-EMPTY:
578562; SM90-NEXT: // %bb.0:
579563; SM90-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
580564; SM90-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
581565; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2;
582- ; SM90-NEXT: cvt.f32.bf16 %f1, %rs2 ;
566+ ; SM90-NEXT: cvt.f32.bf16 %f1, %rs1 ;
583567; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
584- ; SM90-NEXT: cvt.f32.bf16 %f2, %rs4 ;
568+ ; SM90-NEXT: cvt.f32.bf16 %f2, %rs3 ;
585569; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586- ; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
587- ; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
588- ; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
570+ ; SM90-NEXT: cvt.f32.bf16 %f4, %rs2;
571+ ; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
589572; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
590- ; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591- ; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
573+ ; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
592574; SM90-NEXT: st.param.b32 [func_retval0], %r3;
593575; SM90-NEXT: ret;
594576 %r = fdiv <2 x bfloat> %a , %b
0 commit comments