@@ -740,14 +740,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -763,14 +763,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
@@ -786,14 +786,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
@@ -809,14 +809,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
@@ -832,14 +832,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -855,14 +855,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
@@ -878,14 +878,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
@@ -901,14 +901,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq