@@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12201220; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
12211221; AVX2-NEXT: vmovdqa (%rdi), %xmm1
12221222; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1223- ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1223+ ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
12241224; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
12251225; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12261226; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12341234; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
12351235; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
12361236; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1237- ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
1237+ ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
12381238; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12391239; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
12401240; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12471247; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
12481248; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
12491249; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1250- ; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
1250+ ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
12511251; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12521252; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
12531253; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12591259; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
12601260; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
12611261; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1262- ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
12631262; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
12641263; AVX512BW-NEXT: kmovd %eax, %k1
1265- ; AVX512BW-NEXT: vmovdqu8 %ymm0 , %ymm1 {%k1}
1264+ ; AVX512BW-NEXT: vpbroadcastb %xmm0 , %ymm1 {%k1}
12661265; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
12671266; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
12681267; AVX512BW-NEXT: vzeroupper
@@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13451344; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
13461345; AVX2-NEXT: vmovdqa (%rdi), %xmm1
13471346; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1348- ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1347+ ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
13491348; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
13501349; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
13511350; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13591358; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
13601359; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
13611360; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1362- ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1361+ ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
13631362; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
13641363; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
13651364; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13721371; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
13731372; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
13741373; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1375- ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1374+ ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
13761375; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
13771376; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
13781377; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13841383; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
13851384; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
13861385; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1387- ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
13881386; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
13891387; AVX512BW-NEXT: kmovd %eax, %k1
1390- ; AVX512BW-NEXT: vmovdqu8 %ymm0 , %ymm1 {%k1}
1388+ ; AVX512BW-NEXT: vpbroadcastb %xmm0 , %ymm1 {%k1}
13911389; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
13921390; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
13931391; AVX512BW-NEXT: vzeroupper
@@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
17191717; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
17201718; AVX2-NEXT: vmovdqa (%rdi), %xmm1
17211719; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1722- ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1720+ ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
17231721; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
17241722; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
17251723; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
17321730; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
17331731; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
17341732; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1735- ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1733+ ; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
17361734; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
17371735; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
17381736; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
17451743; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
17461744; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
17471745; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1748- ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1746+ ; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
17491747; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
17501748; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
17511749; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2691,14 +2689,13 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
26912689; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
26922690; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
26932691; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2694- ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
2695- ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2696- ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2697- ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2692+ ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2693+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2694+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
26982695; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
26992696; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2700- ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
27012697; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2698+ ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
27022699; AVX2-NEXT: vzeroupper
27032700; AVX2-NEXT: retq
27042701;
@@ -2708,10 +2705,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27082705; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
27092706; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
27102707; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2711- ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2
2712- ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
27132708; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2714- ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2709+ ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2710+ ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
27152711; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
27162712; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
27172713; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2724,10 +2720,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27242720; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
27252721; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
27262722; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2727- ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2
2728- ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
27292723; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2730- ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2724+ ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2725+ ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
27312726; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
27322727; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
27332728; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2739,11 +2734,10 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27392734; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
27402735; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
27412736; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
2742- ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
2737+ ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
27432738; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
27442739; AVX512BW-NEXT: kmovd %eax, %k1
2745- ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
2746- ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2740+ ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
27472741; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
27482742; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
27492743; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2959,14 +2953,13 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
29592953; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
29602954; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
29612955; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2962- ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
2963- ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2964- ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2965- ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2956+ ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2957+ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2958+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
29662959; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
29672960; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2968- ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
29692961; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2962+ ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
29702963; AVX2-NEXT: vzeroupper
29712964; AVX2-NEXT: retq
29722965;
@@ -2976,10 +2969,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
29762969; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
29772970; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
29782971; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2979- ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
2980- ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
29812972; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2982- ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2973+ ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2974+ ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
29832975; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
29842976; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
29852977; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2992,10 +2984,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
29922984; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
29932985; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
29942986; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2995- ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
2996- ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
29972987; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2998- ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2988+ ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2989+ ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
29992990; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
30002991; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
30012992; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3007,11 +2998,10 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
30072998; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
30082999; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
30093000; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
3010- ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
3001+ ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
30113002; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
30123003; AVX512BW-NEXT: kmovd %eax, %k1
3013- ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
3014- ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
3004+ ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
30153005; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
30163006; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
30173007; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3742,14 +3732,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37423732; AVX2-NEXT: vmovdqa (%rdi), %xmm0
37433733; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
37443734; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3745- ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
37463735; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3747- ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
3748- ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3749- ; AVX2-NEXT: vpaddb (%rdx), %ymm0 , %ymm0
3750- ; AVX2-NEXT: vpaddb 32(%rdx), %ymm2 , %ymm1
3751- ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3752- ; AVX2-NEXT: vmovdqa %ymm1, 32 (%rcx)
3736+ ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
3737+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3738+ ; AVX2-NEXT: vpaddb (%rdx), %ymm1 , %ymm1
3739+ ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0 , %ymm0
3740+ ; AVX2-NEXT: vmovdqa %ymm0, 32 (%rcx)
3741+ ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
37533742; AVX2-NEXT: vzeroupper
37543743; AVX2-NEXT: retq
37553744;
@@ -3759,9 +3748,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37593748; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
37603749; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
37613750; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3762- ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
3763- ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
37643751; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3752+ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
37653753; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
37663754; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
37673755; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3775,9 +3763,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37753763; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
37763764; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
37773765; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3778- ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
3779- ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
37803766; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3767+ ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
37813768; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
37823769; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
37833770; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
0 commit comments