@@ -204,7 +204,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
204204;
205205; SM80-LABEL: test_faddx2(
206206; SM80: {
207- ; SM80-NEXT: .reg .b16 %rs<5 >;
207+ ; SM80-NEXT: .reg .b16 %rs<7 >;
208208; SM80-NEXT: .reg .b32 %r<4>;
209209; SM80-NEXT: .reg .f32 %f<7>;
210210; SM80-EMPTY:
@@ -216,16 +216,18 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
216216; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217217; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
218218; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
219220; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
220221; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
221222; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
222- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
223+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
223225; SM80-NEXT: st.param.b32 [func_retval0], %r3;
224226; SM80-NEXT: ret;
225227;
226228; SM80-FTZ-LABEL: test_faddx2(
227229; SM80-FTZ: {
228- ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
230+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
229231; SM80-FTZ-NEXT: .reg .b32 %r<4>;
230232; SM80-FTZ-NEXT: .reg .f32 %f<7>;
231233; SM80-FTZ-EMPTY:
@@ -237,10 +239,12 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
237239; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
238240; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
239241; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
240243; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
241244; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
242245; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
243- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
246+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
244248; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
245249; SM80-FTZ-NEXT: ret;
246250;
@@ -307,7 +311,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
307311;
308312; SM80-LABEL: test_fsubx2(
309313; SM80: {
310- ; SM80-NEXT: .reg .b16 %rs<5 >;
314+ ; SM80-NEXT: .reg .b16 %rs<7 >;
311315; SM80-NEXT: .reg .b32 %r<4>;
312316; SM80-NEXT: .reg .f32 %f<7>;
313317; SM80-EMPTY:
@@ -319,16 +323,18 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
319323; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
320324; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
321325; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
322327; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
323328; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
324329; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
325- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
330+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
326332; SM80-NEXT: st.param.b32 [func_retval0], %r3;
327333; SM80-NEXT: ret;
328334;
329335; SM80-FTZ-LABEL: test_fsubx2(
330336; SM80-FTZ: {
331- ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
337+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
332338; SM80-FTZ-NEXT: .reg .b32 %r<4>;
333339; SM80-FTZ-NEXT: .reg .f32 %f<7>;
334340; SM80-FTZ-EMPTY:
@@ -340,10 +346,12 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
340346; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
341347; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
342348; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
343350; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
344351; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
345352; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
346- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
353+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
347355; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
348356; SM80-FTZ-NEXT: ret;
349357;
@@ -410,7 +418,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
410418;
411419; SM80-LABEL: test_fmulx2(
412420; SM80: {
413- ; SM80-NEXT: .reg .b16 %rs<5 >;
421+ ; SM80-NEXT: .reg .b16 %rs<7 >;
414422; SM80-NEXT: .reg .b32 %r<4>;
415423; SM80-NEXT: .reg .f32 %f<7>;
416424; SM80-EMPTY:
@@ -422,16 +430,18 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
422430; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
423431; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
424432; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
425434; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
426435; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
427436; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
428- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
437+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
429439; SM80-NEXT: st.param.b32 [func_retval0], %r3;
430440; SM80-NEXT: ret;
431441;
432442; SM80-FTZ-LABEL: test_fmulx2(
433443; SM80-FTZ: {
434- ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
444+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
435445; SM80-FTZ-NEXT: .reg .b32 %r<4>;
436446; SM80-FTZ-NEXT: .reg .f32 %f<7>;
437447; SM80-FTZ-EMPTY:
@@ -443,10 +453,12 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
443453; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
444454; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
445455; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
446457; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
447458; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
448459; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
449- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
460+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
450462; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
451463; SM80-FTZ-NEXT: ret;
452464;
@@ -513,7 +525,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
513525;
514526; SM80-LABEL: test_fdiv(
515527; SM80: {
516- ; SM80-NEXT: .reg .b16 %rs<5 >;
528+ ; SM80-NEXT: .reg .b16 %rs<7 >;
517529; SM80-NEXT: .reg .b32 %r<4>;
518530; SM80-NEXT: .reg .f32 %f<7>;
519531; SM80-EMPTY:
@@ -525,16 +537,18 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
525537; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
526538; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
527539; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
528541; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
529542; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
530543; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
531- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
544+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
532546; SM80-NEXT: st.param.b32 [func_retval0], %r3;
533547; SM80-NEXT: ret;
534548;
535549; SM80-FTZ-LABEL: test_fdiv(
536550; SM80-FTZ: {
537- ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
551+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
538552; SM80-FTZ-NEXT: .reg .b32 %r<4>;
539553; SM80-FTZ-NEXT: .reg .f32 %f<7>;
540554; SM80-FTZ-EMPTY:
@@ -546,16 +560,18 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
546560; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
547561; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
548562; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
549564; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
550565; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
551566; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
552- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
567+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
553569; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
554570; SM80-FTZ-NEXT: ret;
555571;
556572; SM90-LABEL: test_fdiv(
557573; SM90: {
558- ; SM90-NEXT: .reg .b16 %rs<5 >;
574+ ; SM90-NEXT: .reg .b16 %rs<7 >;
559575; SM90-NEXT: .reg .b32 %r<4>;
560576; SM90-NEXT: .reg .f32 %f<7>;
561577; SM90-EMPTY:
@@ -567,10 +583,12 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
567583; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
568584; SM90-NEXT: cvt.f32.bf16 %f2, %rs4;
569585; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586+ ; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
570587; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
571588; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
572589; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
573- ; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
590+ ; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591+ ; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
574592; SM90-NEXT: st.param.b32 [func_retval0], %r3;
575593; SM90-NEXT: ret;
576594 %r = fdiv <2 x bfloat> %a , %b
0 commit comments