@@ -3652,13 +3652,11 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
36523652; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
36533653; AVX512BW-FAST: # %bb.0:
36543654; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
3655- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
36563655; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3657- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
3658- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3659- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
3660- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3661- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3656+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
3657+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0]
3658+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3659+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
36623660; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
36633661; AVX512BW-FAST-NEXT: vzeroupper
36643662; AVX512BW-FAST-NEXT: retq
@@ -3857,13 +3855,11 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
38573855; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
38583856; AVX512BW-FAST: # %bb.0:
38593857; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
3860- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
38613858; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3862- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
3863- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3864- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
3865- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3866- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3859+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
3860+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0]
3861+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3862+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
38673863; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
38683864; AVX512BW-FAST-NEXT: vzeroupper
38693865; AVX512BW-FAST-NEXT: retq
@@ -4085,13 +4081,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
40854081; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
40864082; AVX512BW-FAST: # %bb.0:
40874083; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4088- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
40894084; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4090- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4091- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4092- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
4093- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4094- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4085+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
4086+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0]
4087+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
4088+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
40954089; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
40964090; AVX512BW-FAST-NEXT: vzeroupper
40974091; AVX512BW-FAST-NEXT: retq
@@ -4292,13 +4286,11 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
42924286; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
42934287; AVX512BW-FAST: # %bb.0:
42944288; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4295- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
42964289; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4297- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4298- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4299- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4300- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4301- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4290+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
4291+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47,48,49,0,51,52,53,54,55,0,0,0,0,0,0,0,0]
4292+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
4293+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
43024294; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
43034295; AVX512BW-FAST-NEXT: vzeroupper
43044296; AVX512BW-FAST-NEXT: retq
@@ -5101,32 +5093,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
51015093; AVX512DQ-NEXT: vzeroupper
51025094; AVX512DQ-NEXT: retq
51035095;
5104- ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5105- ; AVX512BW-SLOW: # %bb.0:
5106- ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
5107- ; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23]
5108- ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5109- ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5110- ; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
5111- ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5112- ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5113- ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
5114- ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5115- ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
5116- ; AVX512BW-SLOW-NEXT: vzeroupper
5117- ; AVX512BW-SLOW-NEXT: retq
5118- ;
5119- ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5120- ; AVX512BW-FAST: # %bb.0:
5121- ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
5122- ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5123- ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
5124- ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
5125- ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
5126- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
5127- ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
5128- ; AVX512BW-FAST-NEXT: vzeroupper
5129- ; AVX512BW-FAST-NEXT: retq
5096+ ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5097+ ; AVX512BW: # %bb.0:
5098+ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
5099+ ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5100+ ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
5101+ ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
5102+ ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
5103+ ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
5104+ ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5105+ ; AVX512BW-NEXT: vzeroupper
5106+ ; AVX512BW-NEXT: retq
51305107 %in.vec.base = load <64 x i8 >, ptr %in.vec.base.ptr , align 64
51315108 %in.vec.bias = load <64 x i8 >, ptr %in.vec.bias.ptr , align 64
51325109 %in.vec = add <64 x i8 > %in.vec.base , %in.vec.bias
@@ -5381,13 +5358,11 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
53815358; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
53825359; AVX512BW-SLOW: # %bb.0:
53835360; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
5384- ; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11]
53855361; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5386- ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5387- ; AVX512BW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
5388- ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5389- ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5390- ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5362+ ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
5363+ ; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
5364+ ; AVX512BW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
5365+ ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
53915366; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
53925367; AVX512BW-SLOW-NEXT: vzeroupper
53935368; AVX512BW-SLOW-NEXT: retq
0 commit comments