@@ -2470,8 +2470,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
24702470; AVX512BW: # %bb.0:
24712471; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
24722472; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2473- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2474- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2473+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
24752474; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
24762475; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
24772476; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -2609,8 +2608,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
26092608; AVX512BW: # %bb.0:
26102609; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
26112610; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2612- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2613- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2611+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
26142612; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
26152613; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
26162614; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
@@ -2740,8 +2738,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27402738; AVX512BW: # %bb.0:
27412739; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
27422740; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2743- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2744- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2741+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
27452742; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
27462743; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
27472744; AVX512BW-NEXT: kmovd %eax, %k1
@@ -2879,8 +2876,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
28792876; AVX512BW: # %bb.0:
28802877; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
28812878; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2882- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2883- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2879+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
28842880; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
28852881; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
28862882; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3010,8 +3006,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
30103006; AVX512BW: # %bb.0:
30113007; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
30123008; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3013- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3014- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3009+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
30153010; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
30163011; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
30173012; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3148,8 +3143,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
31483143; AVX512BW: # %bb.0:
31493144; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
31503145; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3151- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3152- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3146+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
31533147; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
31543148; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
31553149; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3290,8 +3284,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32903284; AVX512BW: # %bb.0:
32913285; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
32923286; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3293- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3294- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3287+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
32953288; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
32963289; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
32973290; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3407,8 +3400,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
34073400; AVX512BW: # %bb.0:
34083401; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
34093402; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3410- ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3411- ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3403+ ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
34123404; AVX512BW-NEXT: movw $1, %ax
34133405; AVX512BW-NEXT: kmovd %eax, %k1
34143406; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
@@ -4565,17 +4557,30 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
45654557; AVX512DQ-NEXT: vzeroupper
45664558; AVX512DQ-NEXT: retq
45674559;
4568- ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4569- ; AVX512BW: # %bb.0:
4570- ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4571- ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4572- ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4573- ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
4574- ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4575- ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4576- ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4577- ; AVX512BW-NEXT: vzeroupper
4578- ; AVX512BW-NEXT: retq
4560+ ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4561+ ; AVX512BW-SLOW: # %bb.0:
4562+ ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4563+ ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4564+ ; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
4565+ ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4566+ ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
4567+ ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4568+ ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4569+ ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4570+ ; AVX512BW-SLOW-NEXT: vzeroupper
4571+ ; AVX512BW-SLOW-NEXT: retq
4572+ ;
4573+ ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4574+ ; AVX512BW-FAST: # %bb.0:
4575+ ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4576+ ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4577+ ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4578+ ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
4579+ ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4580+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4581+ ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4582+ ; AVX512BW-FAST-NEXT: vzeroupper
4583+ ; AVX512BW-FAST-NEXT: retq
45794584 %in.vec.base = load <64 x i8 >, ptr %in.vec.base.ptr , align 64
45804585 %in.vec.bias = load <64 x i8 >, ptr %in.vec.bias.ptr , align 64
45814586 %in.vec = add <64 x i8 > %in.vec.base , %in.vec.bias
0 commit comments