@@ -740,16 +740,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride3_vf16:
@@ -763,16 +762,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride3_vf16:
@@ -786,16 +784,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride3_vf16:
@@ -809,16 +806,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: store_i8_stride3_vf16:
@@ -832,16 +828,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -855,16 +850,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: store_i8_stride3_vf16:
@@ -878,16 +872,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -901,16 +894,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64