@@ -671,15 +671,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
671671; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
672672; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
673673; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
674- ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
675- ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
676674; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
677- ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
678675; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
676+ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
677+ ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
679678; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
680- ; AVX2-NEXT: vmovdqa %xmm0, 16(%rcx)
681- ; AVX2-NEXT: vmovdqa %xmm1, (%rcx)
679+ ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
680+ ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
682681; AVX2-NEXT: vmovdqa %xmm2, 32(%rcx)
682+ ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
683+ ; AVX2-NEXT: vzeroupper
683684; AVX2-NEXT: retq
684685;
685686; AVX2-FP-LABEL: store_i8_stride3_vf16:
@@ -693,15 +694,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
693694; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
694695; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
695696; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
696- ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
697- ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
698697; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
699- ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
700698; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
699+ ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
700+ ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
701701; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
702- ; AVX2-FP-NEXT: vmovdqa %xmm0, 16(%rcx)
703- ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rcx)
702+ ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
703+ ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
704704; AVX2-FP-NEXT: vmovdqa %xmm2, 32(%rcx)
705+ ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
706+ ; AVX2-FP-NEXT: vzeroupper
705707; AVX2-FP-NEXT: retq
706708;
707709; AVX2-FCP-LABEL: store_i8_stride3_vf16:
@@ -715,15 +717,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
715717; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
716718; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
717719; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
718- ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
719- ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
720720; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
721- ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
722721; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
722+ ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
723+ ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
723724; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
724- ; AVX2-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
725- ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rcx)
725+ ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
726+ ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
726727; AVX2-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
728+ ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx)
729+ ; AVX2-FCP-NEXT: vzeroupper
727730; AVX2-FCP-NEXT: retq
728731;
729732; AVX512-LABEL: store_i8_stride3_vf16:
0 commit comments