@@ -762,32 +762,32 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
762762; SM70-NEXT: // %bb.0:
763763; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
764764; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
765- ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
766- ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
767- ; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
768- ; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
769- ; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
765+ ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
766+ ; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
770767; SM70-NEXT: shl.b32 %r6, %r5, 16;
771768; SM70-NEXT: mov.b32 %f1, %r6;
772- ; SM70-NEXT: cvt.u32.u16 %r7, %rs7 ;
769+ ; SM70-NEXT: cvt.u32.u16 %r7, %rs1 ;
773770; SM70-NEXT: shl.b32 %r8, %r7, 16;
774771; SM70-NEXT: mov.b32 %f2, %r8;
775- ; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
772+ ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
773+ ; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
776774; SM70-NEXT: shl.b32 %r10, %r9, 16;
777775; SM70-NEXT: mov.b32 %f3, %r10;
778- ; SM70-NEXT: cvt.u32.u16 %r11, %rs5 ;
776+ ; SM70-NEXT: cvt.u32.u16 %r11, %rs3 ;
779777; SM70-NEXT: shl.b32 %r12, %r11, 16;
780778; SM70-NEXT: mov.b32 %f4, %r12;
781- ; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
779+ ; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
780+ ; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
782781; SM70-NEXT: shl.b32 %r14, %r13, 16;
783782; SM70-NEXT: mov.b32 %f5, %r14;
784- ; SM70-NEXT: cvt.u32.u16 %r15, %rs3 ;
783+ ; SM70-NEXT: cvt.u32.u16 %r15, %rs5 ;
785784; SM70-NEXT: shl.b32 %r16, %r15, 16;
786785; SM70-NEXT: mov.b32 %f6, %r16;
787- ; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
786+ ; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
787+ ; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
788788; SM70-NEXT: shl.b32 %r18, %r17, 16;
789789; SM70-NEXT: mov.b32 %f7, %r18;
790- ; SM70-NEXT: cvt.u32.u16 %r19, %rs1 ;
790+ ; SM70-NEXT: cvt.u32.u16 %r19, %rs7 ;
791791; SM70-NEXT: shl.b32 %r20, %r19, 16;
792792; SM70-NEXT: mov.b32 %f8, %r20;
793793; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
@@ -804,18 +804,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
804804; SM80-NEXT: // %bb.0:
805805; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
806806; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
807- ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1 ;
808- ; SM80-NEXT: mov.b32 {%rs3 , %rs4}, %r2 ;
809- ; SM80-NEXT: mov.b32 {%rs5 , %rs6}, %r3 ;
810- ; SM80-NEXT: mov.b32 {%rs7 , %rs8 }, %r4 ;
811- ; SM80-NEXT: cvt.f32.bf16 %f1 , %rs8 ;
812- ; SM80-NEXT: cvt.f32.bf16 %f2 , %rs7 ;
813- ; SM80-NEXT: cvt.f32.bf16 %f3 , %rs6;
814- ; SM80-NEXT: cvt.f32.bf16 %f4 , %rs5 ;
815- ; SM80-NEXT: cvt.f32.bf16 %f5 , %rs4 ;
816- ; SM80-NEXT: cvt.f32.bf16 %f6 , %rs3 ;
817- ; SM80-NEXT: cvt.f32.bf16 %f7, %rs2 ;
818- ; SM80-NEXT: cvt.f32.bf16 %f8, %rs1 ;
807+ ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4 ;
808+ ; SM80-NEXT: cvt.f32.bf16 %f1 , %rs2 ;
809+ ; SM80-NEXT: cvt.f32.bf16 %f2 , %rs1 ;
810+ ; SM80-NEXT: mov.b32 {%rs3 , %rs4 }, %r3 ;
811+ ; SM80-NEXT: cvt.f32.bf16 %f3 , %rs4 ;
812+ ; SM80-NEXT: cvt.f32.bf16 %f4 , %rs3 ;
813+ ; SM80-NEXT: mov.b32 {%rs5 , %rs6}, %r2 ;
814+ ; SM80-NEXT: cvt.f32.bf16 %f5 , %rs6 ;
815+ ; SM80-NEXT: cvt.f32.bf16 %f6 , %rs5 ;
816+ ; SM80-NEXT: mov.b32 {%rs7 , %rs8}, %r1 ;
817+ ; SM80-NEXT: cvt.f32.bf16 %f7, %rs8 ;
818+ ; SM80-NEXT: cvt.f32.bf16 %f8, %rs7 ;
819819; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
820820; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
821821; SM80-NEXT: ret;
@@ -830,18 +830,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
830830; SM80-FTZ-NEXT: // %bb.0:
831831; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
832832; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
833- ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1 ;
834- ; SM80-FTZ-NEXT: mov.b32 {%rs3 , %rs4}, %r2 ;
835- ; SM80-FTZ-NEXT: mov.b32 {%rs5 , %rs6}, %r3 ;
836- ; SM80-FTZ-NEXT: mov.b32 {%rs7 , %rs8 }, %r4 ;
837- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1 , %rs8 ;
838- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2 , %rs7 ;
839- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3 , %rs6;
840- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4 , %rs5 ;
841- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5 , %rs4 ;
842- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6 , %rs3 ;
843- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2 ;
844- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1 ;
833+ ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4 ;
834+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1 , %rs2 ;
835+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2 , %rs1 ;
836+ ; SM80-FTZ-NEXT: mov.b32 {%rs3 , %rs4 }, %r3 ;
837+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3 , %rs4 ;
838+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4 , %rs3 ;
839+ ; SM80-FTZ-NEXT: mov.b32 {%rs5 , %rs6}, %r2 ;
840+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5 , %rs6 ;
841+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6 , %rs5 ;
842+ ; SM80-FTZ-NEXT: mov.b32 {%rs7 , %rs8}, %r1 ;
843+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs8 ;
844+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs7 ;
845845; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
846846; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
847847; SM80-FTZ-NEXT: ret;
@@ -856,18 +856,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
856856; SM90-NEXT: // %bb.0:
857857; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
858858; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
859- ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1 ;
860- ; SM90-NEXT: mov.b32 {%rs3 , %rs4}, %r2 ;
861- ; SM90-NEXT: mov.b32 {%rs5 , %rs6}, %r3 ;
862- ; SM90-NEXT: mov.b32 {%rs7 , %rs8 }, %r4 ;
863- ; SM90-NEXT: cvt.f32.bf16 %f1 , %rs8 ;
864- ; SM90-NEXT: cvt.f32.bf16 %f2 , %rs7 ;
865- ; SM90-NEXT: cvt.f32.bf16 %f3 , %rs6;
866- ; SM90-NEXT: cvt.f32.bf16 %f4 , %rs5 ;
867- ; SM90-NEXT: cvt.f32.bf16 %f5 , %rs4 ;
868- ; SM90-NEXT: cvt.f32.bf16 %f6 , %rs3 ;
869- ; SM90-NEXT: cvt.f32.bf16 %f7, %rs2 ;
870- ; SM90-NEXT: cvt.f32.bf16 %f8, %rs1 ;
859+ ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4 ;
860+ ; SM90-NEXT: cvt.f32.bf16 %f1 , %rs2 ;
861+ ; SM90-NEXT: cvt.f32.bf16 %f2 , %rs1 ;
862+ ; SM90-NEXT: mov.b32 {%rs3 , %rs4 }, %r3 ;
863+ ; SM90-NEXT: cvt.f32.bf16 %f3 , %rs4 ;
864+ ; SM90-NEXT: cvt.f32.bf16 %f4 , %rs3 ;
865+ ; SM90-NEXT: mov.b32 {%rs5 , %rs6}, %r2 ;
866+ ; SM90-NEXT: cvt.f32.bf16 %f5 , %rs6 ;
867+ ; SM90-NEXT: cvt.f32.bf16 %f6 , %rs5 ;
868+ ; SM90-NEXT: mov.b32 {%rs7 , %rs8}, %r1 ;
869+ ; SM90-NEXT: cvt.f32.bf16 %f7, %rs8 ;
870+ ; SM90-NEXT: cvt.f32.bf16 %f8, %rs7 ;
871871; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
872872; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
873873; SM90-NEXT: ret;
0 commit comments