@@ -4775,15 +4775,13 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
47754775; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20
47764776; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm27, %zmm20 {%k1}
47774777; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm30[0,1,2,3],zmm14[4,5,6,7]
4778- ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
4779- ; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm27, %zmm27
4778+ ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm27 = zmm27[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
47804779; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm27 = zmm27[2,2,2,3,6,6,6,7]
47814780; AVX512BW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492
47824781; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2
47834782; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm27, %zmm20 {%k2}
47844783; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm12[4,5,6,7]
4785- ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
4786- ; AVX512BW-SLOW-NEXT: vpshufb %zmm27, %zmm24, %zmm24
4784+ ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm24 = zmm24[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
47874785; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm24 = zmm24[2,2,2,3,6,6,6,7]
47884786; AVX512BW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
47894787; AVX512BW-SLOW-NEXT: kmovq %rcx, %k3
@@ -4804,11 +4802,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
48044802; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16
48054803; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1}
48064804; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14
4807- ; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14
4805+ ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
48084806; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
48094807; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2}
48104808; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12
4811- ; AVX512BW-SLOW-NEXT: vpshufb %zmm27, %zmm12, %zmm12
4809+ ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm12 = zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
48124810; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
48134811; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3}
48144812; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
@@ -4882,24 +4880,24 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
48824880; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm8
48834881; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10
48844882; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm3
4885- ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
4886- ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm0
4883+ ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
4884+ ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0
48874885; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
4888- ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1
4886+ ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm1
48894887; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
48904888; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
48914889; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
48924890; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
48934891; AVX512BW-FAST-NEXT: vpermw %ymm1, %ymm9, %ymm1
48944892; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
4895- ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5
4893+ ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm6
48964894; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm17 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
4897- ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm5, %ymm0
4895+ ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm6, %ymm0
48984896; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm7
48994897; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm7, %ymm2
49004898; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
49014899; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
4902- ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31]
4900+ ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31]
49034901; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
49044902; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm12, %ymm2
49054903; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -4908,49 +4906,47 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
49084906; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
49094907; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm2
49104908; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm8[4,5,6,7]
4911- ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
4912- ; AVX512BW-FAST-NEXT: vpshufb %zmm13, %zmm1, %zmm1
4909+ ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
49134910; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
49144911; AVX512BW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492
49154912; AVX512BW-FAST-NEXT: kmovd %eax, %k2
49164913; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
49174914; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1
49184915; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm10[4,5,6,7]
4919- ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
4920- ; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm11, %zmm11
4916+ ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = zmm11[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
49214917; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7]
49224918; AVX512BW-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
49234919; AVX512BW-FAST-NEXT: kmovq %rax, %k3
49244920; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm0 {%k3}
49254921; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm11
4926- ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm16
4922+ ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm13
49274923; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14
4928- ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6
4929- ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm16[0],ymm6[1],ymm16[1],ymm6[2],ymm16[2],ymm6[3],ymm16[3],ymm6[4],ymm16[4],ymm6[5],ymm16[5],ymm6[6],ymm16[6],ymm6[7],ymm16[7],ymm6[16],ymm16[16],ymm6[17],ymm16[17],ymm6[18],ymm16[18],ymm6[19],ymm16[19],ymm6[20],ymm16[20],ymm6[21],ymm16[21],ymm6[22],ymm16[22],ymm6[23],ymm16[23]
4930- ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
4931- ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31]
4932- ; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm9, %ymm9
4933- ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm9
4924+ ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm5
4925+ ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23]
4926+ ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
4927+ ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31]
4928+ ; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm9, %ymm9
4929+ ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm9
49344930; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm19
4935- ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm19, %ymm6
4931+ ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm19, %ymm5
49364932; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm20
4937- ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm20, %ymm16
4938- ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm16[0],ymm6[0],ymm16[1],ymm6[1],ymm16[2],ymm6[2],ymm16[3],ymm6[3],ymm16[4],ymm6[4],ymm16[5],ymm6[5],ymm16[6],ymm6[6],ymm16[7],ymm6[7],ymm16[16],ymm6[16],ymm16[17],ymm6[17],ymm16[18],ymm6[18],ymm16[19],ymm6[19],ymm16[20],ymm6[20],ymm16[21],ymm6[21],ymm16[22],ymm6[22],ymm16[23],ymm6[23]
4939- ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
4940- ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm20[8],ymm19[8],ymm20[9],ymm19[9],ymm20[10],ymm19[10],ymm20[11],ymm19[11],ymm20[12],ymm19[12],ymm20[13],ymm19[13],ymm20[14],ymm19[14],ymm20[15],ymm19[15],ymm20[24],ymm19[24],ymm20[25],ymm19[25],ymm20[26],ymm19[26],ymm20[27],ymm19[27],ymm20[28],ymm19[28],ymm20[29],ymm19[29],ymm20[30],ymm19[30],ymm20[31],ymm19[31]
4941- ; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm12, %ymm12
4942- ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6
4943- ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1}
4933+ ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm20, %ymm13
4934+ ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[16],ymm5[16],ymm13[17],ymm5[17],ymm13[18],ymm5[18],ymm13[19],ymm5[19],ymm13[20],ymm5[20],ymm13[21],ymm5[21],ymm13[22],ymm5[22],ymm13[23],ymm5[23]
4935+ ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
4936+ ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm20[8],ymm19[8],ymm20[9],ymm19[9],ymm20[10],ymm19[10],ymm20[11],ymm19[11],ymm20[12],ymm19[12],ymm20[13],ymm19[13],ymm20[14],ymm19[14],ymm20[15],ymm19[15],ymm20[24],ymm19[24],ymm20[25],ymm19[25],ymm20[26],ymm19[26],ymm20[27],ymm19[27],ymm20[28],ymm19[28],ymm20[29],ymm19[29],ymm20[30],ymm19[30],ymm20[31],ymm19[31]
4937+ ; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm12, %ymm12
4938+ ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5
4939+ ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm5 {%k1}
49444940; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm9
49454941; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4946- ; AVX512BW-FAST-NEXT: vpshufb %zmm13, %zmm8, %zmm8
4942+ ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
49474943; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7]
4948- ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm6 {%k2}
4944+ ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm5 {%k2}
49494945; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm8
49504946; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm10
4951- ; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm10, %zmm10
4947+ ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
49524948; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7]
4953- ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm6 {%k3}
4949+ ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm5 {%k3}
49544950; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm22
49554951; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm13
49564952; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm13, %xmm10
@@ -5029,17 +5025,17 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50295025; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm14, %zmm11
50305026; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30]
50315027; AVX512BW-FAST-NEXT: vpermw %zmm19, %zmm20, %zmm11 {%k1}
5032- ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23]
5028+ ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
50335029; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
5034- ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
5030+ ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
50355031; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
50365032; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm21[8],xmm18[8],xmm21[9],xmm18[9],xmm21[10],xmm18[10],xmm21[11],xmm18[11],xmm21[12],xmm18[12],xmm21[13],xmm18[13],xmm21[14],xmm18[14],xmm21[15],xmm18[15]
50375033; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
50385034; AVX512BW-FAST-NEXT: vpermw %zmm3, %zmm14, %zmm3
50395035; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5040- ; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm20, %zmm3 {%k1}
5041- ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
5042- ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4
5036+ ; AVX512BW-FAST-NEXT: vpermw %zmm6, %zmm20, %zmm3 {%k1}
5037+ ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
5038+ ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm6, %ymm4
50435039; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
50445040; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1]
50455041; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9
@@ -5049,7 +5045,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50495045; AVX512BW-FAST-NEXT: kmovd %eax, %k1
50505046; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm11 {%k1}
50515047; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5052- ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4
5048+ ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm6, %ymm4
50535049; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
50545050; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
50555051; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8
@@ -5060,12 +5056,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50605056; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2}
50615057; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2
50625058; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5063- ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4
5059+ ; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm6, %ymm4
50645060; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
50655061; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
50665062; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
50675063; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5068- ; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2
5064+ ; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm6, %ymm2
50695065; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1
50705066; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
50715067; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
@@ -5075,7 +5071,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50755071; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax)
50765072; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, (%rax)
50775073; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax)
5078- ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax)
5074+ ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax)
50795075; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax)
50805076; AVX512BW-FAST-NEXT: vzeroupper
50815077; AVX512BW-FAST-NEXT: retq
0 commit comments